fixed the final scraper_quickstart.py example
This commit is contained in:
@@ -19,7 +19,7 @@ async def basic_scraper_example():
|
|||||||
# Create a simple filter chain
|
# Create a simple filter chain
|
||||||
filter_chain = FilterChain([
|
filter_chain = FilterChain([
|
||||||
# Only crawl pages within the blog section
|
# Only crawl pages within the blog section
|
||||||
# URLPatternFilter("*/tutorial/*"),
|
URLPatternFilter("*/tutorial/*"),
|
||||||
# Only process HTML pages
|
# Only process HTML pages
|
||||||
ContentTypeFilter(["text/html"])
|
ContentTypeFilter(["text/html"])
|
||||||
])
|
])
|
||||||
@@ -29,7 +29,8 @@ async def basic_scraper_example():
|
|||||||
max_depth=2, # Only go 2 levels deep
|
max_depth=2, # Only go 2 levels deep
|
||||||
filter_chain=filter_chain,
|
filter_chain=filter_chain,
|
||||||
url_scorer=None, # Use default scoring
|
url_scorer=None, # Use default scoring
|
||||||
max_concurrent=3 # Limit concurrent requests
|
max_concurrent=3, # Limit concurrent requests
|
||||||
|
process_external_links=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create the crawler and scraper
|
# Create the crawler and scraper
|
||||||
@@ -79,8 +80,8 @@ async def advanced_scraper_example():
|
|||||||
filter_chain = FilterChain([
|
filter_chain = FilterChain([
|
||||||
# Domain control
|
# Domain control
|
||||||
DomainFilter(
|
DomainFilter(
|
||||||
allowed_domains=["example.com", "blog.example.com"],
|
allowed_domains=["techcrunch.com"],
|
||||||
blocked_domains=["ads.example.com", "tracker.example.com"]
|
blocked_domains=["login.techcrunch.com","legal.yahoo.com"]
|
||||||
),
|
),
|
||||||
# URL patterns
|
# URL patterns
|
||||||
URLPatternFilter([
|
URLPatternFilter([
|
||||||
@@ -114,7 +115,7 @@ async def advanced_scraper_example():
|
|||||||
|
|
||||||
# Initialize strategy with advanced configuration
|
# Initialize strategy with advanced configuration
|
||||||
strategy = BFSScraperStrategy(
|
strategy = BFSScraperStrategy(
|
||||||
max_depth=4,
|
max_depth=2,
|
||||||
filter_chain=filter_chain,
|
filter_chain=filter_chain,
|
||||||
url_scorer=scorer,
|
url_scorer=scorer,
|
||||||
max_concurrent=5,
|
max_concurrent=5,
|
||||||
@@ -122,7 +123,7 @@ async def advanced_scraper_example():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create crawler and scraper
|
# Create crawler and scraper
|
||||||
crawler = AsyncWebCrawler()
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
scraper = AsyncWebScraper(crawler, strategy)
|
scraper = AsyncWebScraper(crawler, strategy)
|
||||||
|
|
||||||
# Track statistics
|
# Track statistics
|
||||||
@@ -134,16 +135,13 @@ async def advanced_scraper_example():
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Use streaming mode
|
# Use streaming mode
|
||||||
async for result in scraper.ascrape("https://example.com/news/", stream=True):
|
result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True)
|
||||||
|
async for result in result_generator:
|
||||||
stats['processed'] += 1
|
stats['processed'] += 1
|
||||||
|
|
||||||
if result.success:
|
if result.success:
|
||||||
stats['total_size'] += len(result.html)
|
stats['total_size'] += len(result.html)
|
||||||
logger.info(f"Processed: {result.url}")
|
logger.info(f"Processed: {result.url}")
|
||||||
|
|
||||||
# Print scoring information
|
|
||||||
for scorer_name, score in result.scores.items():
|
|
||||||
logger.debug(f"{scorer_name}: {score:.2f}")
|
|
||||||
else:
|
else:
|
||||||
stats['errors'] += 1
|
stats['errors'] += 1
|
||||||
logger.error(f"Failed to process {result.url}: {result.error_message}")
|
logger.error(f"Failed to process {result.url}: {result.error_message}")
|
||||||
@@ -177,8 +175,9 @@ if __name__ == "__main__":
|
|||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
# Run basic example
|
# Run basic example
|
||||||
print("Running basic scraper example...")
|
# print("Running basic scraper example...")
|
||||||
asyncio.run(basic_scraper_example())
|
# asyncio.run(basic_scraper_example())
|
||||||
|
|
||||||
# print("\nRunning advanced scraper example...")
|
# Run advanced example
|
||||||
# asyncio.run(advanced_scraper_example())
|
print("\nRunning advanced scraper example...")
|
||||||
|
asyncio.run(advanced_scraper_example())
|
||||||
Reference in New Issue
Block a user