refactor(scraper): optimize URL validation and filter performance
- Replace validators library with built-in urlparse for URL validation
- Optimize filter statistics update logic for better performance
- Add performance benchmarking suite for filters
- Add execution time tracking to scraper examples
- Update .gitignore with windsurfrules

BREAKING CHANGE: Removed dependency on the validators library for URL validation
This commit is contained in:
@@ -8,6 +8,7 @@ from crawl4ai.scraper import (
|
||||
)
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
|
||||
import re
|
||||
import time
|
||||
|
||||
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
|
||||
|
||||
@@ -55,6 +56,7 @@ async def basic_scraper_example():
|
||||
|
||||
# advanced_scraper_example.py
|
||||
import logging
|
||||
|
||||
from crawl4ai.scraper import (
|
||||
AsyncWebScraper,
|
||||
BFSScraperStrategy,
|
||||
@@ -177,11 +179,15 @@ async def advanced_scraper_example():
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
# Run basic example
|
||||
start_time = time.perf_counter()
|
||||
print("Running basic scraper example...")
|
||||
asyncio.run(basic_scraper_example())
|
||||
end_time = time.perf_counter()
|
||||
print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
|
||||
|
||||
# Run advanced example
|
||||
print("\nRunning advanced scraper example...")
|
||||
asyncio.run(advanced_scraper_example())
|
||||
# # Run advanced example
|
||||
# print("\nRunning advanced scraper example...")
|
||||
# asyncio.run(advanced_scraper_example())
|
||||
|
||||
Reference in New Issue
Block a user