Fixed a bug in _process_links, handled the case where url_scorer is passed as None, and renamed the scrapper folder to scraper.

Aravind Karnam
2024-11-23 13:52:34 +05:30
parent c1797037c0
commit f8e85b1499
6 changed files with 35 additions and 31 deletions
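The _process_links fix itself lives in one of the other changed files and is not shown in the diff below, which only covers the example script. As a rough illustration (process_links, link, and url_scorer.score are hypothetical names, not taken from the commit), handling url_scorer=None likely amounts to a guard like this:

# Hypothetical sketch of the url_scorer=None guard described in the
# commit message -- not the actual crawl4ai code, which is not in this diff.
def process_links(links, url_scorer=None):
    scored = []
    for link in links:
        # Only call the scorer when one was provided; fall back to 0
        score = url_scorer.score(link) if url_scorer is not None else 0
        scored.append((link, score))
    # Visit higher-scoring links first; order is arbitrary without a scorer
    return sorted(scored, key=lambda pair: pair[1], reverse=True)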


@@ -7,6 +7,7 @@ from crawl4ai.scraper import (
     ContentTypeFilter
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler
+import re
 
 async def basic_scraper_example():
     """
@@ -18,7 +19,7 @@ async def basic_scraper_example():
     # Create a simple filter chain
     filter_chain = FilterChain([
         # Only crawl pages within the blog section
-        URLPatternFilter("*/blog/*"),
+        # URLPatternFilter("*/tutorial/*"),
         # Only process HTML pages
         ContentTypeFilter(["text/html"])
     ])
@@ -32,20 +33,19 @@ async def basic_scraper_example():
     )
 
     # Create the crawler and scraper
-    crawler = AsyncWebCrawler()
-    scraper = AsyncWebScraper(crawler, strategy)
-
-    # Start scraping
-    try:
-        result = await scraper.ascrape("https://example.com/blog/")
-
-        # Process results
-        print(f"Crawled {len(result.crawled_urls)} pages:")
-        for url, data in result.extracted_data.items():
-            print(f"- {url}: {len(data.html)} bytes")
-
-    except Exception as e:
-        print(f"Error during scraping: {e}")
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)
+
+        # Start scraping
+        try:
+            result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
+        except Exception as e:
+            print(f"Error during scraping: {e}")
+
+        # Process results
+        print(f"Crawled {len(result.crawled_urls)} pages:")
+        for url, data in result.extracted_data.items():
+            print(f"- {url}: {len(data.html)} bytes")
 
 # advanced_scraper_example.py
 import logging
@@ -180,5 +180,5 @@ if __name__ == "__main__":
 
     print("Running basic scraper example...")
     asyncio.run(basic_scraper_example())
-    print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
+    # print("\nRunning advanced scraper example...")
+    # asyncio.run(advanced_scraper_example())