feat: Add deep crawl capabilities to arun_many function
This commit is contained in:
@@ -798,6 +798,22 @@ class AsyncWebCrawler:
|
|||||||
):
|
):
|
||||||
print(f"Processed {result.url}: {len(result.markdown)} chars")
|
print(f"Processed {result.url}: {len(result.markdown)} chars")
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
async def merge_async_generators(generators):
    """Interleave items from several async generators as they become ready.

    One ``__anext__()`` call per source is kept in flight. Whenever one of
    them completes, its item is yielded and a new fetch is scheduled for the
    same source. A source that raises ``StopAsyncIteration`` is dropped, and
    iteration ends once every source has been dropped.

    Args:
        generators: Iterable of async generators (any object exposing an
            awaitable ``__anext__()`` works).

    Yields:
        Items from the sources in completion order; the relative order
        between different sources is not fixed.

    Raises:
        Any exception other than ``StopAsyncIteration`` raised by a source
        is propagated to the consumer.
    """
    # ensure_future accepts both coroutines and plain awaitables returned
    # by __anext__(); it needs the running loop, which we have here.
    pending = {asyncio.ensure_future(gen.__anext__()): gen for gen in generators}
    try:
        while pending:
            done, _ = await asyncio.wait(
                list(pending), return_when=asyncio.FIRST_COMPLETED
            )
            for task in done:
                gen = pending.pop(task)  # Source that produced this task
                # Keep the try narrow so that only exhaustion of the source
                # is swallowed, not something thrown in at the yield below.
                try:
                    result = task.result()
                except StopAsyncIteration:
                    continue  # Source is exhausted; do not reschedule it
                yield result
                pending[asyncio.ensure_future(gen.__anext__())] = gen
    finally:
        # Reached on normal completion (nothing pending), on early close by
        # the consumer, or when a source failed. Cancel the fetches still in
        # flight so they do not keep running in the background, and wait for
        # the cancellations to settle so the sources can run their cleanup.
        if pending:
            for task in pending:
                task.cancel()
            await asyncio.gather(*pending, return_exceptions=True)
|
||||||
|
|
||||||
if config is None:
|
if config is None:
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
word_count_threshold=word_count_threshold,
|
word_count_threshold=word_count_threshold,
|
||||||
@@ -838,6 +854,27 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
stream = config.stream
|
stream = config.stream
|
||||||
|
|
||||||
|
if config.deep_crawl_strategy:
|
||||||
|
if config.stream:
|
||||||
|
generators = []
|
||||||
|
for url in urls:
|
||||||
|
generators.append(
|
||||||
|
config.deep_crawl_strategy.arun(
|
||||||
|
start_url=url, crawler=self, crawler_run_config=config
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return merge_async_generators(generators)
|
||||||
|
else:
|
||||||
|
results = []
|
||||||
|
for url in urls:
|
||||||
|
url_results = []
|
||||||
|
async for result in config.deep_crawl_strategy.arun(
|
||||||
|
start_url=url, crawler=self, crawler_run_config=config
|
||||||
|
):
|
||||||
|
url_results.append(result)
|
||||||
|
results.append(url_results)
|
||||||
|
return results
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
|
||||||
async def result_transformer():
|
async def result_transformer():
|
||||||
|
|||||||
@@ -187,7 +187,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
|
|
||||||
finally:
|
finally:
|
||||||
self.stats.end_time = datetime.now()
|
self.stats.end_time = datetime.now()
|
||||||
await crawler.close()
|
|
||||||
|
|
||||||
async def shutdown(self):
|
async def shutdown(self):
|
||||||
"""Clean up resources and stop crawling"""
|
"""Clean up resources and stop crawling"""
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
# basic_scraper_example.py
|
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
|
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
|
||||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||||
from crawl4ai.deep_crawl import (
|
from crawl4ai.deep_crawl import (
|
||||||
@@ -20,9 +19,9 @@ import logging
|
|||||||
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
|
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
|
||||||
|
|
||||||
|
|
||||||
async def basic_scraper_example():
|
async def basic_example():
|
||||||
"""
|
"""
|
||||||
Basic example: Scrape a blog site for articles
|
Basic example: Deep crawl a blog site for articles
|
||||||
- Crawls only HTML pages
|
- Crawls only HTML pages
|
||||||
- Stays within the blog section
|
- Stays within the blog section
|
||||||
- Collects all results at once
|
- Collects all results at once
|
||||||
@@ -45,14 +44,15 @@ async def basic_scraper_example():
|
|||||||
process_external_links=True,
|
process_external_links=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create the crawler and scraper
|
# Create the crawler
|
||||||
async with AsyncWebCrawler(
|
async with AsyncWebCrawler(
|
||||||
config=browser_config,
|
config=browser_config,
|
||||||
) as crawler:
|
) as crawler:
|
||||||
# Start scraping
|
# Start scraping
|
||||||
try:
|
try:
|
||||||
results = await crawler.arun(
|
results = await crawler.arun(
|
||||||
"https://crawl4ai.com/mkdocs", CrawlerRunConfig(deep_crawl_strategy=bfs_strategy)
|
"https://crawl4ai.com/mkdocs",
|
||||||
|
CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
|
||||||
)
|
)
|
||||||
# Process results
|
# Process results
|
||||||
print(f"Crawled {len(results)} pages:")
|
print(f"Crawled {len(results)} pages:")
|
||||||
@@ -62,9 +62,10 @@ async def basic_scraper_example():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error during scraping: {e}")
|
print(f"Error during scraping: {e}")
|
||||||
|
|
||||||
async def advanced_scraper_example():
|
|
||||||
|
async def advanced_example():
|
||||||
"""
|
"""
|
||||||
Advanced example: Intelligent news site scraping
|
Advanced example: Intelligent news site crawling
|
||||||
- Uses all filter types
|
- Uses all filter types
|
||||||
- Implements sophisticated scoring
|
- Implements sophisticated scoring
|
||||||
- Streams results
|
- Streams results
|
||||||
@@ -72,7 +73,7 @@ async def advanced_scraper_example():
|
|||||||
"""
|
"""
|
||||||
# Set up logging
|
# Set up logging
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
logger = logging.getLogger("advanced_scraper")
|
logger = logging.getLogger("advanced_deep_crawler")
|
||||||
|
|
||||||
# Create sophisticated filter chain
|
# Create sophisticated filter chain
|
||||||
filter_chain = FilterChain(
|
filter_chain = FilterChain(
|
||||||
@@ -115,7 +116,7 @@ async def advanced_scraper_example():
|
|||||||
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
|
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create crawler and scraper
|
# Create crawler
|
||||||
async with AsyncWebCrawler(
|
async with AsyncWebCrawler(
|
||||||
config=browser_config,
|
config=browser_config,
|
||||||
) as crawler:
|
) as crawler:
|
||||||
@@ -128,8 +129,7 @@ async def advanced_scraper_example():
|
|||||||
results = []
|
results = []
|
||||||
result_generator = await crawler.arun(
|
result_generator = await crawler.arun(
|
||||||
"https://techcrunch.com",
|
"https://techcrunch.com",
|
||||||
config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy,
|
config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
|
||||||
stream=True)
|
|
||||||
)
|
)
|
||||||
async for result in result_generator:
|
async for result in result_generator:
|
||||||
stats["processed"] += 1
|
stats["processed"] += 1
|
||||||
@@ -174,17 +174,87 @@ async def advanced_scraper_example():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def basic_example_many_urls():
    """Deep crawl several start URLs with ``arun_many`` and print a summary.

    Builds a BFS deep-crawl strategy (depth 2, external links disabled)
    filtered by a ``*/basic/*`` URL pattern and the ``text/html`` content
    type, then runs it through ``arun_many`` without streaming. In that
    mode the call returns one list of results per start URL, so the page
    count printed below is the sum of the inner list lengths.
    """
    filter_chain = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        process_external_links=False,
    )

    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start scraping
        try:
            results = await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
            )
            # Process results. `results` holds one list per start URL, so
            # len(results) would count start URLs rather than crawled pages.
            total_pages = sum(len(url_result) for url_result in results)
            print(f"Crawled {total_pages} pages:")
            for url_result in results:
                for result in url_result:
                    print(f"- {result.url}: {len(result.html)} bytes")

        except Exception as e:
            # Example script: report the failure instead of crashing.
            print(f"Error during scraping: {e}")
|
||||||
|
|
||||||
|
async def basic_example_many_urls_stream():
    """Deep crawl several start URLs with ``arun_many`` in streaming mode.

    Uses the same BFS configuration as the non-streaming example (depth 2,
    ``*/basic/*`` URL pattern, ``text/html`` only, external links disabled)
    but passes ``stream=True``, so the awaited ``arun_many`` call hands back
    an async iterator and each result is printed as it is produced.
    """
    html_under_basic = FilterChain(
        [URLPatternFilter("*/basic/*"), ContentTypeFilter(["text/html"])]
    )
    # Strategy: two levels deep, default scoring, external links ignored
    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        filter_chain=html_under_basic,
        url_scorer=None,
        process_external_links=False,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            # Await first to obtain the stream, then iterate over it
            page_stream = await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=strategy, stream=True),
            )
            async for result in page_stream:
                print(f"- {result.url}: {len(result.html)} bytes")
        except Exception as e:
            print(f"Error during scraping: {e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
|
|
||||||
# Run basic example
|
# Run basic example
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
print("Running basic scraper example...")
|
print("Running basic Deep crawl example...")
|
||||||
asyncio.run(basic_scraper_example())
|
asyncio.run(basic_example())
|
||||||
end_time = time.perf_counter()
|
end_time = time.perf_counter()
|
||||||
print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
|
print(f"Basic deep crawl example completed in {end_time - start_time:.2f} seconds")
|
||||||
|
|
||||||
# # Run advanced example
|
# Run advanced example
|
||||||
print("\nRunning advanced scraper example...")
|
print("\nRunning advanced deep crawl example...")
|
||||||
asyncio.run(advanced_scraper_example())
|
asyncio.run(advanced_example())
|
||||||
|
|
||||||
|
print("\nRunning advanced deep crawl example with arun_many...")
|
||||||
|
asyncio.run(basic_example_many_urls())
|
||||||
|
|
||||||
|
print("\nRunning advanced deep crawl example with arun_many streaming enabled...")
|
||||||
|
asyncio.run(basic_example_many_urls_stream())
|
||||||
|
|||||||
Reference in New Issue
Block a user