From f8e85b149939eea296805a680c080bbac884f269 Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Sat, 23 Nov 2024 13:52:34 +0530
Subject: [PATCH] Fixed a bug in _process_links, handled condition for when
 url_scorer is passed as None, renamed the scrapper folder to scraper.

---
 crawl4ai/scraper/bfs_scraper_strategy.py      | 34 +++++++++++--------
 .../async_web_scraper.md                      |  0
 .../bfs_scraper_strategy.md                   |  0
 .../{scrapper => scraper}/filters_scrorers.md |  0
 docs/{scrapper => scraper}/how_to_use.md      |  0
 .../scraper_quickstart.py                     | 32 ++++++++----------
 6 files changed, 35 insertions(+), 31 deletions(-)
 rename docs/{scrapper => scraper}/async_web_scraper.md (100%)
 rename docs/{scrapper => scraper}/bfs_scraper_strategy.md (100%)
 rename docs/{scrapper => scraper}/filters_scrorers.md (100%)
 rename docs/{scrapper => scraper}/how_to_use.md (100%)
 rename docs/{scrapper => scraper}/scraper_quickstart.py (88%)

diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py
index cae7ba90..72a86203 100644
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy):
         links_ro_process = result.links["internal"]
         if self.process_external_links:
             links_ro_process += result.links["external"]
-        for link_type in links_ro_process:
-            for link in result.links[link_type]:
-                url = link['href']
-                # url = urljoin(source_url, link['href'])
-                # url = urlunparse(urlparse(url)._replace(fragment=""))
-
-                if url not in visited and await self.can_process_url(url):
-                    new_depth = depths[source_url] + 1
-                    if new_depth <= self.max_depth:
+        for link in links_ro_process:
+            url = link['href']
+            # url = urljoin(source_url, link['href'])
+            # url = urlunparse(urlparse(url)._replace(fragment=""))
+
+            if url not in visited and await self.can_process_url(url):
+                new_depth = depths[source_url] + 1
+                if new_depth <= self.max_depth:
+                    if self.url_scorer:
                         score = self.url_scorer.score(url)
-                        await queue.put((score, new_depth, url))
-                        depths[url] = new_depth
-                        self.stats.total_depth_reached = max(
-                            self.stats.total_depth_reached,
-                            new_depth
-                        )
+                    else:
+                        # When no url_scorer is provided all urls will have the same score of 0.
+                        # Therefore they will be processed in FIFO order as per URL depth
+                        score = 0
+                    await queue.put((score, new_depth, url))
+                    depths[url] = new_depth
+                    self.stats.total_depth_reached = max(
+                        self.stats.total_depth_reached,
+                        new_depth
+                    )
 
     async def ascrape(
         self,
diff --git a/docs/scrapper/async_web_scraper.md b/docs/scraper/async_web_scraper.md
similarity index 100%
rename from docs/scrapper/async_web_scraper.md
rename to docs/scraper/async_web_scraper.md
diff --git a/docs/scrapper/bfs_scraper_strategy.md b/docs/scraper/bfs_scraper_strategy.md
similarity index 100%
rename from docs/scrapper/bfs_scraper_strategy.md
rename to docs/scraper/bfs_scraper_strategy.md
diff --git a/docs/scrapper/filters_scrorers.md b/docs/scraper/filters_scrorers.md
similarity index 100%
rename from docs/scrapper/filters_scrorers.md
rename to docs/scraper/filters_scrorers.md
diff --git a/docs/scrapper/how_to_use.md b/docs/scraper/how_to_use.md
similarity index 100%
rename from docs/scrapper/how_to_use.md
rename to docs/scraper/how_to_use.md
diff --git a/docs/scrapper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py
similarity index 88%
rename from docs/scrapper/scraper_quickstart.py
rename to docs/scraper/scraper_quickstart.py
index a2c7a239..811f997e 100644
--- a/docs/scrapper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -7,6 +7,7 @@ from crawl4ai.scraper import (
     ContentTypeFilter
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler
+import re
 
 async def basic_scraper_example():
     """
@@ -18,7 +19,7 @@ async def basic_scraper_example():
     # Create a simple filter chain
     filter_chain = FilterChain([
         # Only crawl pages within the blog section
-        URLPatternFilter("*/blog/*"),
+        # URLPatternFilter("*/tutorial/*"),
         # Only process HTML pages
         ContentTypeFilter(["text/html"])
     ])
@@ -32,20 +33,19 @@ async def basic_scraper_example():
     )
 
     # Create the crawler and scraper
-    crawler = AsyncWebCrawler()
-    scraper = AsyncWebScraper(crawler, strategy)
-
-    # Start scraping
-    try:
-        result = await scraper.ascrape("https://example.com/blog/")
-
-        # Process results
-        print(f"Crawled {len(result.crawled_urls)} pages:")
-        for url, data in result.extracted_data.items():
-            print(f"- {url}: {len(data.html)} bytes")
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)
+        # Start scraping
+        try:
+            result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
 
-    except Exception as e:
-        print(f"Error during scraping: {e}")
+            # Process results
+            print(f"Crawled {len(result.crawled_urls)} pages:")
+            for url, data in result.extracted_data.items():
+                print(f"- {url}: {len(data.html)} bytes")
+
+        except Exception as e:
+            print(f"Error during scraping: {e}")
 
 # advanced_scraper_example.py
 import logging
@@ -180,5 +180,5 @@ if __name__ == "__main__":
     print("Running basic scraper example...")
     asyncio.run(basic_scraper_example())
 
-    print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
\ No newline at end of file
+    # print("\nRunning advanced scraper example...")
+    # asyncio.run(advanced_scraper_example())
\ No newline at end of file
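
Note on the url_scorer change above: queue items are (score, new_depth, url) tuples, so when no scorer is supplied every link gets score 0 and the priority queue falls back to comparing depth (ties broken by URL string), i.e. shallower pages are dequeued first. Below is a minimal sketch of that ordering; it assumes the strategy's queue is an asyncio.PriorityQueue (the queue type and variable name are not shown in this hunk), and only the tuple layout is taken from the patch.

import asyncio

async def demo() -> None:
    # Hypothetical stand-in for the strategy's internal queue; only the
    # (score, depth, url) tuple layout comes from the patch above.
    queue: asyncio.PriorityQueue = asyncio.PriorityQueue()

    # With url_scorer=None every link is enqueued with score 0, so tuple
    # comparison falls through to the second element: the crawl depth.
    await queue.put((0, 2, "https://example.com/a/b"))
    await queue.put((0, 1, "https://example.com/a"))
    await queue.put((0, 3, "https://example.com/a/b/c"))

    while not queue.empty():
        score, depth, url = await queue.get()
        print(depth, url)  # dequeues depth 1, then 2, then 3

asyncio.run(demo())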