refactor(scraper): optimize URL validation and filter performance

- Replace validators library with built-in urlparse for URL validation
- Optimize filter statistics update logic for better performance
- Add performance benchmarking suite for filters
- Add execution time tracking to scraper examples
- Update gitignore with windsurfrules

BREAKING CHANGE: Removed the dependency on the validators library for URL validation.
This commit is contained in:
UncleCode
2025-01-22 19:45:56 +08:00
parent 6e78c56dda
commit e6ef8d91ba
6 changed files with 1140 additions and 13 deletions

View File

@@ -8,6 +8,7 @@ from crawl4ai.scraper import (
)
from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
import re
import time
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
@@ -55,6 +56,7 @@ async def basic_scraper_example():
# advanced_scraper_example.py
import logging
from crawl4ai.scraper import (
AsyncWebScraper,
BFSScraperStrategy,
@@ -177,11 +179,15 @@ async def advanced_scraper_example():
# Script entry point: runs the basic scraper example and reports wall-clock time.
# NOTE(review): this is a rendered diff fragment — the leading indentation of the
# guarded statements was stripped by the diff view, and it shows both the
# pre-change lines (running the advanced example, below) and the post-change
# lines (the same calls commented out) together. Confirm against the real file
# which version is current before editing.
if __name__ == "__main__":
import asyncio
import time  # time.perf_counter is used for the elapsed-time measurement below
# Run basic example
start_time = time.perf_counter()
print("Running basic scraper example...")
asyncio.run(basic_scraper_example())
end_time = time.perf_counter()
print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
# Run advanced example
print("\nRunning advanced scraper example...")
asyncio.run(advanced_scraper_example())
# # Run advanced example
# print("\nRunning advanced scraper example...")
# asyncio.run(advanced_scraper_example())