fixed the final scraper_quickstart.py example

Aravind Karnam
2024-11-26 17:05:54 +05:30
parent 155c756238
commit 9530ded83a


@@ -19,7 +19,7 @@ async def basic_scraper_example():
     # Create a simple filter chain
     filter_chain = FilterChain([
         # Only crawl pages within the blog section
-        # URLPatternFilter("*/tutorial/*"),
+        URLPatternFilter("*/tutorial/*"),
         # Only process HTML pages
         ContentTypeFilter(["text/html"])
     ])
@@ -29,7 +29,8 @@ async def basic_scraper_example():
         max_depth=2,  # Only go 2 levels deep
         filter_chain=filter_chain,
         url_scorer=None,  # Use default scoring
-        max_concurrent=3  # Limit concurrent requests
+        max_concurrent=3,  # Limit concurrent requests
+        process_external_links=True
     )

     # Create the crawler and scraper
@@ -79,8 +80,8 @@ async def advanced_scraper_example():
     filter_chain = FilterChain([
         # Domain control
         DomainFilter(
-            allowed_domains=["example.com", "blog.example.com"],
-            blocked_domains=["ads.example.com", "tracker.example.com"]
+            allowed_domains=["techcrunch.com"],
+            blocked_domains=["login.techcrunch.com","legal.yahoo.com"]
         ),
         # URL patterns
         URLPatternFilter([
@@ -114,7 +115,7 @@ async def advanced_scraper_example():
     # Initialize strategy with advanced configuration
     strategy = BFSScraperStrategy(
-        max_depth=4,
+        max_depth=2,
         filter_chain=filter_chain,
         url_scorer=scorer,
         max_concurrent=5,
@@ -122,63 +123,61 @@ async def advanced_scraper_example():
     )
     # Create crawler and scraper
-    crawler = AsyncWebCrawler()
-    scraper = AsyncWebScraper(crawler, strategy)
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)

-    # Track statistics
-    stats = {
-        'processed': 0,
-        'errors': 0,
-        'total_size': 0
-    }
+        # Track statistics
+        stats = {
+            'processed': 0,
+            'errors': 0,
+            'total_size': 0
+        }

-    try:
-        # Use streaming mode
-        async for result in scraper.ascrape("https://example.com/news/", stream=True):
-            stats['processed'] += 1
-            if result.success:
-                stats['total_size'] += len(result.html)
-                logger.info(f"Processed: {result.url}")
-                # Print scoring information
-                for scorer_name, score in result.scores.items():
-                    logger.debug(f"{scorer_name}: {score:.2f}")
-            else:
-                stats['errors'] += 1
-                logger.error(f"Failed to process {result.url}: {result.error_message}")
-            # Log progress regularly
-            if stats['processed'] % 10 == 0:
-                logger.info(f"Progress: {stats['processed']} URLs processed")
-    except Exception as e:
-        logger.error(f"Scraping error: {e}")
-    finally:
-        # Print final statistics
-        logger.info("Scraping completed:")
-        logger.info(f"- URLs processed: {stats['processed']}")
-        logger.info(f"- Errors: {stats['errors']}")
-        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
-        # Print filter statistics
-        for filter_ in filter_chain.filters:
-            logger.info(f"{filter_.name} stats:")
-            logger.info(f"- Passed: {filter_.stats.passed_urls}")
-            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
-        # Print scorer statistics
-        logger.info("Scoring statistics:")
-        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
-        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
+        try:
+            # Use streaming mode
+            result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True)
+            async for result in result_generator:
+                stats['processed'] += 1
+                if result.success:
+                    stats['total_size'] += len(result.html)
+                    logger.info(f"Processed: {result.url}")
+                else:
+                    stats['errors'] += 1
+                    logger.error(f"Failed to process {result.url}: {result.error_message}")
+                # Log progress regularly
+                if stats['processed'] % 10 == 0:
+                    logger.info(f"Progress: {stats['processed']} URLs processed")
+        except Exception as e:
+            logger.error(f"Scraping error: {e}")
+        finally:
+            # Print final statistics
+            logger.info("Scraping completed:")
+            logger.info(f"- URLs processed: {stats['processed']}")
+            logger.info(f"- Errors: {stats['errors']}")
+            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
+            # Print filter statistics
+            for filter_ in filter_chain.filters:
+                logger.info(f"{filter_.name} stats:")
+                logger.info(f"- Passed: {filter_.stats.passed_urls}")
+                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
+            # Print scorer statistics
+            logger.info("Scoring statistics:")
+            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
+            logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")

 if __name__ == "__main__":
     import asyncio
     # Run basic example
-    print("Running basic scraper example...")
-    asyncio.run(basic_scraper_example())
-    # print("\nRunning advanced scraper example...")
-    # asyncio.run(advanced_scraper_example())
+    # print("Running basic scraper example...")
+    # asyncio.run(basic_scraper_example())
+    # Run advanced example
+    print("\nRunning advanced scraper example...")
+    asyncio.run(advanced_scraper_example())
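
For reference, a condensed, self-contained sketch of how the pieces touched by this diff fit together when run on their own. The constructor arguments, ascrape() call, and result fields mirror the diff above; the crawl4ai import paths and the logging setup are assumptions for illustration and may need adjusting to the actual package layout. The scorer is left as None (default scoring), as the basic example allows.

# Minimal sketch of the updated advanced example.
# NOTE: the import paths below are assumptions; adjust to the real package layout.
import asyncio
import logging

from crawl4ai import AsyncWebCrawler                # assumed import path
from crawl4ai.scraper import (                      # assumed import path
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    DomainFilter,
    ContentTypeFilter,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def main():
    # Same filter setup the diff switches to: stay on techcrunch.com,
    # skip the login/legal subdomains, and only process HTML pages.
    filter_chain = FilterChain([
        DomainFilter(
            allowed_domains=["techcrunch.com"],
            blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
        ),
        ContentTypeFilter(["text/html"]),
    ])

    strategy = BFSScraperStrategy(
        max_depth=2,          # the diff lowers this from 4 to 2
        filter_chain=filter_chain,
        url_scorer=None,      # default scoring, as in the basic example
        max_concurrent=5,
    )

    # The diff moves the crawler into an async context manager.
    async with AsyncWebCrawler(verbose=True) as crawler:
        scraper = AsyncWebScraper(crawler, strategy)
        result_generator = await scraper.ascrape(
            "https://techcrunch.com", parallel_processing=True, stream=True
        )
        async for result in result_generator:
            if result.success:
                logger.info(f"Processed: {result.url} ({len(result.html)} bytes)")
            else:
                logger.error(f"Failed: {result.url}: {result.error_message}")


if __name__ == "__main__":
    asyncio.run(main())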