feat: Add deep crawl capabilities to arun_many function

Aravind Karnam
2025-01-30 17:49:58 +05:30
parent f6edb8342e
commit f7ce2d42c9
3 changed files with 124 additions and 18 deletions


@@ -798,6 +798,22 @@ class AsyncWebCrawler:
             ):
                 print(f"Processed {result.url}: {len(result.markdown)} chars")
         """
+        async def merge_async_generators(generators):
+            tasks = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
+            while tasks:
+                done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+                for task in done:
+                    gen = tasks.pop(task)  # Get the generator associated with this task
+                    try:
+                        result = task.result()
+                        yield result  # Yield the result
+                        tasks[asyncio.create_task(gen.__anext__())] = gen  # Fetch next item
+                    except StopAsyncIteration:
+                        pass  # Generator is exhausted, don't add it back to the tasks
+
         if config is None:
             config = CrawlerRunConfig(
                 word_count_threshold=word_count_threshold,
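The helper added here merges several async generators by keeping one in-flight `__anext__()` task per generator, racing them with `asyncio.wait(..., return_when=asyncio.FIRST_COMPLETED)`, and re-arming whichever source just produced a value, so consumers see results in completion order. A minimal, self-contained sketch of the same pattern (the `ticker` generators and their delays are made up purely for illustration):

import asyncio

async def merge_async_generators(generators):
    # One pending __anext__() task per generator, keyed back to its source
    tasks = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
    while tasks:
        done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            gen = tasks.pop(task)
            try:
                yield task.result()
                # Re-arm the generator that just produced a value
                tasks[asyncio.create_task(gen.__anext__())] = gen
            except StopAsyncIteration:
                pass  # Exhausted generator: stop scheduling it

async def ticker(name, delay, count):
    # Stand-in for a per-URL deep crawl result stream
    for i in range(count):
        await asyncio.sleep(delay)
        yield f"{name}-{i}"

async def main():
    merged = merge_async_generators([ticker("a", 0.03, 3), ticker("b", 0.05, 2)])
    async for item in merged:
        print(item)  # "a-*" and "b-*" interleave in completion order

asyncio.run(main())

Note that `asyncio.wait` iterates the dict's keys, so only the task objects are awaited; the dict values merely remember which generator to re-arm once its task completes.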
@@ -838,6 +854,27 @@ class AsyncWebCrawler:
         stream = config.stream
 
+        if config.deep_crawl_strategy:
+            if config.stream:
+                generators = []
+                for url in urls:
+                    generators.append(
+                        config.deep_crawl_strategy.arun(
+                            start_url=url, crawler=self, crawler_run_config=config
+                        )
+                    )
+                return merge_async_generators(generators)
+            else:
+                results = []
+                for url in urls:
+                    url_results = []
+                    async for result in config.deep_crawl_strategy.arun(
+                        start_url=url, crawler=self, crawler_run_config=config
+                    ):
+                        url_results.append(result)
+                    results.append(url_results)
+                return results
+
         if stream:
             async def result_transformer():
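With this change, the return shape of `arun_many` depends on the config whenever a `deep_crawl_strategy` is set: `stream=True` returns one merged async generator across all start URLs, while batch mode returns a list holding one list of results per start URL (so `len(results)` counts start URLs, not crawled pages). A condensed sketch of both consumption patterns, assuming the import paths used by the example file below, a default-constructed `AsyncWebCrawler`, and that `BFSDeepCrawlStrategy`'s remaining parameters can be left at their defaults:

import asyncio
from crawl4ai import AsyncWebCrawler  # import path assumed
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.deep_crawl import BFSDeepCrawlStrategy

async def main():
    strategy = BFSDeepCrawlStrategy(max_depth=1)
    async with AsyncWebCrawler() as crawler:
        # Batch mode: a list with one list of results per start URL
        results = await crawler.arun_many(
            urls=["https://crawl4ai.com/mkdocs"],
            config=CrawlerRunConfig(deep_crawl_strategy=strategy),
        )
        for url_results in results:      # outer loop: start URLs
            for result in url_results:   # inner loop: pages reached from that URL
                print(result.url)

        # Streaming mode: one merged async generator across all start URLs
        async for result in await crawler.arun_many(
            urls=["https://crawl4ai.com/mkdocs"],
            config=CrawlerRunConfig(deep_crawl_strategy=strategy, stream=True),
        ):
            print(result.url)            # arrives in completion order

asyncio.run(main())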


@@ -187,7 +187,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         finally:
             self.stats.end_time = datetime.now()
-            await crawler.close()
 
     async def shutdown(self):
         """Clean up resources and stop crawling"""


@@ -1,4 +1,3 @@
-# basic_scraper_example.py
 from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
 from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
 from crawl4ai.deep_crawl import (
@@ -20,9 +19,9 @@ import logging
 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
 
-async def basic_scraper_example():
+async def basic_example():
     """
-    Basic example: Scrape a blog site for articles
+    Basic example: Deep crawl a blog site for articles
     - Crawls only HTML pages
     - Stays within the blog section
     - Collects all results at once
@@ -45,14 +44,15 @@ async def basic_scraper_example():
         process_external_links=True,
     )
 
-    # Create the crawler and scraper
+    # Create the crawler
     async with AsyncWebCrawler(
         config=browser_config,
     ) as crawler:
         # Start scraping
         try:
             results = await crawler.arun(
-                "https://crawl4ai.com/mkdocs", CrawlerRunConfig(deep_crawl_strategy=bfs_strategy)
+                "https://crawl4ai.com/mkdocs",
+                CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
             )
 
             # Process results
             print(f"Crawled {len(results)} pages:")
@@ -62,9 +62,10 @@ async def basic_scraper_example():
         except Exception as e:
             print(f"Error during scraping: {e}")
 
-async def advanced_scraper_example():
+async def advanced_example():
     """
-    Advanced example: Intelligent news site scraping
+    Advanced example: Intelligent news site crawling
     - Uses all filter types
     - Implements sophisticated scoring
     - Streams results
@@ -72,7 +73,7 @@ async def advanced_scraper_example():
     """
     # Set up logging
     logging.basicConfig(level=logging.INFO)
-    logger = logging.getLogger("advanced_scraper")
+    logger = logging.getLogger("advanced_deep_crawler")
 
     # Create sophisticated filter chain
     filter_chain = FilterChain(
@@ -115,7 +116,7 @@ async def advanced_scraper_example():
         max_depth=2, filter_chain=filter_chain, url_scorer=scorer
     )
 
-    # Create crawler and scraper
+    # Create crawler
     async with AsyncWebCrawler(
         config=browser_config,
     ) as crawler:
@@ -128,8 +129,7 @@ async def advanced_scraper_example():
             results = []
             result_generator = await crawler.arun(
                 "https://techcrunch.com",
-                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy,
-                stream=True)
+                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
             )
 
             async for result in result_generator:
                 stats["processed"] += 1
@@ -174,17 +174,87 @@ async def advanced_scraper_example():
         )
 
+async def basic_example_many_urls():
+    filter_chain = FilterChain(
+        [
+            URLPatternFilter("*/basic/*"),
+            ContentTypeFilter(["text/html"]),
+        ]
+    )
+
+    # Initialize the strategy with basic configuration
+    bfs_strategy = BFSDeepCrawlStrategy(
+        max_depth=2,  # Only go 2 levels deep
+        filter_chain=filter_chain,
+        url_scorer=None,  # Use default scoring
+        process_external_links=False,
+    )
+
+    # Create the crawler
+    async with AsyncWebCrawler(
+        config=browser_config,
+    ) as crawler:
+        # Start scraping
+        try:
+            results = await crawler.arun_many(
+                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
+                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
+            )
+
+            # Process results
+            print(f"Crawled {len(results)} pages:")
+            for url_result in results:
+                for result in url_result:
+                    print(f"- {result.url}: {len(result.html)} bytes")
+        except Exception as e:
+            print(f"Error during scraping: {e}")
+
+async def basic_example_many_urls_stream():
+    filter_chain = FilterChain(
+        [
+            URLPatternFilter("*/basic/*"),
+            ContentTypeFilter(["text/html"]),
+        ]
+    )
+
+    # Initialize the strategy with basic configuration
+    bfs_strategy = BFSDeepCrawlStrategy(
+        max_depth=2,  # Only go 2 levels deep
+        filter_chain=filter_chain,
+        url_scorer=None,  # Use default scoring
+        process_external_links=False,
+    )
+
+    # Create the crawler
+    async with AsyncWebCrawler(
+        config=browser_config,
+    ) as crawler:
+        # Start scraping
+        try:
+            async for result in await crawler.arun_many(
+                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
+                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
+            ):
+                # Process results as they arrive
+                print(f"- {result.url}: {len(result.html)} bytes")
+        except Exception as e:
+            print(f"Error during scraping: {e}")
 if __name__ == "__main__":
     import asyncio
     import time
 
     # Run basic example
     start_time = time.perf_counter()
-    print("Running basic scraper example...")
-    asyncio.run(basic_scraper_example())
+    print("Running basic deep crawl example...")
+    asyncio.run(basic_example())
     end_time = time.perf_counter()
-    print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
+    print(f"Basic deep crawl example completed in {end_time - start_time:.2f} seconds")
 
-    # # Run advanced example
-    print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
+    # Run advanced example
+    print("\nRunning advanced deep crawl example...")
+    asyncio.run(advanced_example())
+
+    print("\nRunning advanced deep crawl example with arun_many...")
+    asyncio.run(basic_example_many_urls())
+
+    print("\nRunning advanced deep crawl example with arun_many streaming enabled...")
+    asyncio.run(basic_example_many_urls_stream())