fixed the final scraper_quickstart.py example
@@ -19,7 +19,7 @@ async def basic_scraper_example():
     # Create a simple filter chain
     filter_chain = FilterChain([
         # Only crawl pages within the blog section
-        # URLPatternFilter("*/tutorial/*"),
+        URLPatternFilter("*/tutorial/*"),
         # Only process HTML pages
         ContentTypeFilter(["text/html"])
     ])
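Note: this hunk activates the previously commented-out URLPatternFilter, so the chain now restricts URLs by glob pattern as well as by content type. As a minimal self-contained sketch of the filtering logic such a chain implements (the helpers below are hypothetical stand-ins built on the standard library, not crawl4ai's real classes):

# Sketch of chained URL filtering, assuming glob-style patterns
# as in URLPatternFilter("*/tutorial/*"). Hypothetical stand-ins,
# not crawl4ai's actual implementation.
from fnmatch import fnmatch

def pattern_filter(patterns):
    # Accept a URL if it matches any of the glob patterns.
    return lambda url: any(fnmatch(url, p) for p in patterns)

def chain(*filters):
    # A URL passes the chain only if every filter accepts it.
    return lambda url: all(f(url) for f in filters)

allow = chain(pattern_filter(["*/tutorial/*"]))
print(allow("https://example.com/tutorial/intro"))  # True
print(allow("https://example.com/pricing"))         # False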
@@ -29,7 +29,8 @@ async def basic_scraper_example():
         max_depth=2,  # Only go 2 levels deep
         filter_chain=filter_chain,
         url_scorer=None,  # Use default scoring
-        max_concurrent=3  # Limit concurrent requests
+        max_concurrent=3,  # Limit concurrent requests
+        process_external_links=True
     )
 
     # Create the crawler and scraper
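Note: the first changed line here is a syntax fix, not a behavior change. Once `process_external_links=True` is appended, the preceding argument needs a trailing comma or the call no longer parses. A minimal sketch (the constructor below is a hypothetical stand-in, not crawl4ai's API):

# Hypothetical stand-in for the strategy constructor, to show the comma fix.
def make_strategy(max_depth, max_concurrent=1, process_external_links=False):
    return dict(max_depth=max_depth, max_concurrent=max_concurrent,
                process_external_links=process_external_links)

# Without the comma after `max_concurrent=3`, appending the next keyword
# argument would be a SyntaxError; with it, the call parses fine:
strategy = make_strategy(
    max_depth=2,
    max_concurrent=3,            # the comma added by this commit
    process_external_links=True  # the new argument
)
print(strategy)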
@@ -79,8 +80,8 @@ async def advanced_scraper_example():
     filter_chain = FilterChain([
         # Domain control
         DomainFilter(
-            allowed_domains=["example.com", "blog.example.com"],
-            blocked_domains=["ads.example.com", "tracker.example.com"]
+            allowed_domains=["techcrunch.com"],
+            blocked_domains=["login.techcrunch.com", "legal.yahoo.com"]
         ),
         # URL patterns
         URLPatternFilter([
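Note: the advanced example now targets a real site, allowing techcrunch.com while blocking its login subdomain and a legal-pages host. The allow/block logic such a filter implements can be sketched with only the standard library (the helper below is hypothetical; crawl4ai's DomainFilter may differ in details such as subdomain handling):

# Sketch of allow/block domain filtering with urllib.parse.
from urllib.parse import urlparse

def domain_allowed(url, allowed, blocked):
    host = urlparse(url).netloc.lower()
    if host in blocked:
        return False
    # Accept exact matches and subdomains of an allowed domain.
    return any(host == d or host.endswith("." + d) for d in allowed)

allowed = ["techcrunch.com"]
blocked = ["login.techcrunch.com", "legal.yahoo.com"]
print(domain_allowed("https://techcrunch.com/2024/01/01/story", allowed, blocked))  # True
print(domain_allowed("https://login.techcrunch.com/", allowed, blocked))            # False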
@@ -114,7 +115,7 @@ async def advanced_scraper_example():
 
     # Initialize strategy with advanced configuration
     strategy = BFSScraperStrategy(
-        max_depth=4,
+        max_depth=2,
         filter_chain=filter_chain,
         url_scorer=scorer,
         max_concurrent=5,
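Note: dropping max_depth from 4 to 2 bounds the crawl sharply. In a breadth-first traversal the frontier can grow roughly with branching_factor**depth, so two fewer levels can mean orders of magnitude fewer pages. A toy BFS over an invented in-memory link graph illustrates the depth cutoff:

# Toy BFS with a max_depth cutoff over an invented link graph,
# to illustrate why max_depth=2 visits far fewer pages than max_depth=4.
from collections import deque

links = {
    "/": ["/a", "/b"],
    "/a": ["/a/1", "/a/2"],
    "/b": ["/b/1"],
    "/a/1": ["/a/1/x"],
}

def bfs(start, max_depth):
    seen, queue = {start}, deque([(start, 0)])
    while queue:
        url, depth = queue.popleft()
        if depth == max_depth:
            continue  # do not expand past the depth limit
        for nxt in links.get(url, []):
            if nxt not in seen:
                seen.add(nxt)
                queue.append((nxt, depth + 1))
    return seen

print(len(bfs("/", 1)))  # 3 pages reached
print(len(bfs("/", 2)))  # 6 pages reached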
@@ -122,63 +123,61 @@ async def advanced_scraper_example():
     )
 
     # Create crawler and scraper
-    crawler = AsyncWebCrawler()
-    scraper = AsyncWebScraper(crawler, strategy)
-
-    # Track statistics
-    stats = {
-        'processed': 0,
-        'errors': 0,
-        'total_size': 0
-    }
-
-    try:
-        # Use streaming mode
-        async for result in scraper.ascrape("https://example.com/news/", stream=True):
-            stats['processed'] += 1
-
-            if result.success:
-                stats['total_size'] += len(result.html)
-                logger.info(f"Processed: {result.url}")
-
-                # Print scoring information
-                for scorer_name, score in result.scores.items():
-                    logger.debug(f"{scorer_name}: {score:.2f}")
-            else:
-                stats['errors'] += 1
-                logger.error(f"Failed to process {result.url}: {result.error_message}")
-
-            # Log progress regularly
-            if stats['processed'] % 10 == 0:
-                logger.info(f"Progress: {stats['processed']} URLs processed")
-
-    except Exception as e:
-        logger.error(f"Scraping error: {e}")
-
-    finally:
-        # Print final statistics
-        logger.info("Scraping completed:")
-        logger.info(f"- URLs processed: {stats['processed']}")
-        logger.info(f"- Errors: {stats['errors']}")
-        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
-
-        # Print filter statistics
-        for filter_ in filter_chain.filters:
-            logger.info(f"{filter_.name} stats:")
-            logger.info(f"- Passed: {filter_.stats.passed_urls}")
-            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
-
-        # Print scorer statistics
-        logger.info("Scoring statistics:")
-        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
-        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)
+
+        # Track statistics
+        stats = {
+            'processed': 0,
+            'errors': 0,
+            'total_size': 0
+        }
+
+        try:
+            # Use streaming mode
+            result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True)
+            async for result in result_generator:
+                stats['processed'] += 1
+
+                if result.success:
+                    stats['total_size'] += len(result.html)
+                    logger.info(f"Processed: {result.url}")
+                else:
+                    stats['errors'] += 1
+                    logger.error(f"Failed to process {result.url}: {result.error_message}")
+
+                # Log progress regularly
+                if stats['processed'] % 10 == 0:
+                    logger.info(f"Progress: {stats['processed']} URLs processed")
+
+        except Exception as e:
+            logger.error(f"Scraping error: {e}")
+
+        finally:
+            # Print final statistics
+            logger.info("Scraping completed:")
+            logger.info(f"- URLs processed: {stats['processed']}")
+            logger.info(f"- Errors: {stats['errors']}")
+            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
+
+            # Print filter statistics
+            for filter_ in filter_chain.filters:
+                logger.info(f"{filter_.name} stats:")
+                logger.info(f"- Passed: {filter_.stats.passed_urls}")
+                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
+
+            # Print scorer statistics
+            logger.info("Scoring statistics:")
+            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
+            logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
 
 if __name__ == "__main__":
     import asyncio
 
     # Run basic example
-    print("Running basic scraper example...")
-    asyncio.run(basic_scraper_example())
+    # print("Running basic scraper example...")
+    # asyncio.run(basic_scraper_example())
 
-    # print("\nRunning advanced scraper example...")
-    # asyncio.run(advanced_scraper_example())
+    # Run advanced example
+    print("\nRunning advanced scraper example...")
+    asyncio.run(advanced_scraper_example())
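Note: the largest change wraps the whole run in `async with AsyncWebCrawler(verbose=True) as crawler:` so browser resources are released even on error, and it awaits `ascrape(...)` to obtain an async generator before iterating it, instead of iterating the coroutine directly. That consumption pattern can be sketched in isolation (the generator below is a stand-in for the real scraper, not crawl4ai's API):

# Sketch of the new pattern: await a call that returns an async
# generator, then iterate it while tracking stats. fake_ascrape is a
# stand-in for scraper.ascrape(..., stream=True).
import asyncio

async def fake_ascrape(url, stream=True):
    async def gen():
        for page in (f"{url}/a", f"{url}/b"):
            yield page
    return gen()

async def main():
    stats = {"processed": 0}
    result_generator = await fake_ascrape("https://example.com")
    async for result in result_generator:
        stats["processed"] += 1
        print("Processed:", result)
    print("Total:", stats["processed"])

asyncio.run(main())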