Fixed a bug in _process_links, handled the case where url_scorer is passed as None, and renamed the scrapper folder to scraper.

This commit is contained in:
Aravind Karnam
2024-11-23 13:52:34 +05:30
parent c1797037c0
commit f8e85b1499
6 changed files with 35 additions and 31 deletions

View File

@@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy):
links_ro_process = result.links["internal"]
if self.process_external_links:
links_ro_process += result.links["external"]
for link_type in links_ro_process:
for link in result.links[link_type]:
url = link['href']
# url = urljoin(source_url, link['href'])
# url = urlunparse(urlparse(url)._replace(fragment=""))
if url not in visited and await self.can_process_url(url):
new_depth = depths[source_url] + 1
if new_depth <= self.max_depth:
for link in links_ro_process:
url = link['href']
# url = urljoin(source_url, link['href'])
# url = urlunparse(urlparse(url)._replace(fragment=""))
if url not in visited and await self.can_process_url(url):
new_depth = depths[source_url] + 1
if new_depth <= self.max_depth:
if self.url_scorer:
score = self.url_scorer.score(url)
await queue.put((score, new_depth, url))
depths[url] = new_depth
self.stats.total_depth_reached = max(
self.stats.total_depth_reached,
new_depth
)
else:
# When no url_scorer is provided, all URLs get the same score of 0.
# Therefore they will be processed in FIFO order by URL depth.
score = 0
await queue.put((score, new_depth, url))
depths[url] = new_depth
self.stats.total_depth_reached = max(
self.stats.total_depth_reached,
new_depth
)
async def ascrape(
self,

View File

@@ -7,6 +7,7 @@ from crawl4ai.scraper import (
ContentTypeFilter
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
import re
async def basic_scraper_example():
"""
@@ -18,7 +19,7 @@ async def basic_scraper_example():
# Create a simple filter chain
filter_chain = FilterChain([
# Only crawl pages within the blog section
URLPatternFilter("*/blog/*"),
# URLPatternFilter("*/tutorial/*"),
# Only process HTML pages
ContentTypeFilter(["text/html"])
])
@@ -32,20 +33,19 @@ async def basic_scraper_example():
)
# Create the crawler and scraper
crawler = AsyncWebCrawler()
scraper = AsyncWebScraper(crawler, strategy)
# Start scraping
try:
result = await scraper.ascrape("https://example.com/blog/")
# Process results
print(f"Crawled {len(result.crawled_urls)} pages:")
for url, data in result.extracted_data.items():
print(f"- {url}: {len(data.html)} bytes")
async with AsyncWebCrawler(verbose=True) as crawler:
scraper = AsyncWebScraper(crawler, strategy)
# Start scraping
try:
result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
except Exception as e:
print(f"Error during scraping: {e}")
# Process results
print(f"Crawled {len(result.crawled_urls)} pages:")
for url, data in result.extracted_data.items():
print(f"- {url}: {len(data.html)} bytes")
except Exception as e:
print(f"Error during scraping: {e}")
# advanced_scraper_example.py
import logging
@@ -180,5 +180,5 @@ if __name__ == "__main__":
print("Running basic scraper example...")
asyncio.run(basic_scraper_example())
print("\nRunning advanced scraper example...")
asyncio.run(advanced_scraper_example())
# print("\nRunning advanced scraper example...")
# asyncio.run(advanced_scraper_example())