refactor(scraper): optimize URL validation and filter performance

- Replace validators library with built-in urlparse for URL validation
- Optimize filter statistics update logic for better performance
- Add performance benchmarking suite for filters
- Add execution time tracking to scraper examples
- Update gitignore with windsurfrules

BREAKING CHANGE: Removed the dependency on the validators library for URL validation.
This commit is contained in:
UncleCode
2025-01-22 19:45:56 +08:00
parent 6e78c56dda
commit e6ef8d91ba
6 changed files with 1140 additions and 13 deletions

View File

@@ -8,6 +8,7 @@ from crawl4ai.scraper import (
)
from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
import re
import time
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
@@ -55,6 +56,7 @@ async def basic_scraper_example():
# advanced_scraper_example.py
import logging
from crawl4ai.scraper import (
AsyncWebScraper,
BFSScraperStrategy,
@@ -177,11 +179,15 @@ async def advanced_scraper_example():
# Script entry point: runs the basic scraper example and reports wall-clock time.
# NOTE(review): this is a rendered diff fragment — the leading indentation of the
# guarded statements was stripped by the diff view, and it shows both the
# pre-change lines (running the advanced example, below) and the post-change
# lines (the same calls commented out) together. Confirm against the real file
# which version is current before editing.
if __name__ == "__main__":
import asyncio
import time  # time.perf_counter is used for the elapsed-time measurement below
# Run basic example
start_time = time.perf_counter()
print("Running basic scraper example...")
asyncio.run(basic_scraper_example())
end_time = time.perf_counter()
print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
# Run advanced example
print("\nRunning advanced scraper example...")
asyncio.run(advanced_scraper_example())
# # Run advanced example
# print("\nRunning advanced scraper example...")
# asyncio.run(advanced_scraper_example())