import os, sys
import time
import warnings
from enum import Enum
from colorama import init, Fore, Back, Style
from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
from .config import (
    MIN_WORD_THRESHOLD,
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    URL_LOG_SHORTEN_LENGTH
)
from .utils import (
    sanitize_input_encode,
    InvalidCSSSelectorError,
    format_html,
    fast_format_html,
    create_box_message,
    get_error_context,  # needed by arun()'s exception handler below
)

from urllib.parse import urlparse
import random
from .__version__ import __version__ as crawl4ai_version
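# NOTE: the wildcard imports above supply the strategy names used below
# (RegexChunking, ChunkingStrategy, ExtractionStrategy, NoExtractionStrategy,
# IdentityChunking, RelevantContentFilter).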


class AsyncWebCrawler:
    """
    Asynchronous web crawler with flexible caching capabilities.

    There are two ways to use the crawler:

    1. Using a context manager (recommended for simple cases):
        ```python
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
        ```

    2. Using explicit lifecycle management (recommended for long-running applications):
        ```python
        crawler = AsyncWebCrawler()
        await crawler.start()

        # Use the crawler multiple times
        result1 = await crawler.arun(url="https://example.com")
        result2 = await crawler.arun(url="https://another.com")

        await crawler.close()
        ```

    Migration Guide:
        Old way (deprecated):
            crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True)

        New way (recommended):
            browser_config = BrowserConfig(browser_type="chromium", headless=True)
            crawler = AsyncWebCrawler(config=browser_config)

    Attributes:
        browser_config (BrowserConfig): Configuration object for browser settings.
        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
        logger (AsyncLogger): Logger instance for recording events and errors.
        always_bypass_cache (bool): Whether to always bypass the cache.
        crawl4ai_folder (str): Directory used for storing cache data.
        base_directory (str): Base directory that contains the cache folder.
        ready (bool): Whether the crawler is ready for use.

    Methods:
        start(): Start the crawler explicitly without using a context manager.
        close(): Close the crawler explicitly without using a context manager.
        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
        awarmup(): Perform the warmup sequence.
        arun_many(): Run the crawler for multiple sources concurrently.
        aprocess_html(): Process HTML content.

    Typical Usage:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://example.com")
            print(result.markdown)

    Using configuration:
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            result = await crawler.arun(url="https://example.com", config=crawler_config)
            print(result.markdown)
    """
    _domain_last_hit = {}
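    # NOTE: _domain_last_hit is deliberately class-level: every crawler instance
    # in the process shares it, so the per-domain rate limiting in arun_many()
    # applies across instances.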

    def __init__(
        self,
        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
        config: Optional[BrowserConfig] = None,
        always_bypass_cache: bool = False,
        always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
        thread_safe: bool = False,
        **kwargs,
    ):
        """
        Initialize the AsyncWebCrawler.

        Args:
            crawler_strategy: Strategy for crawling web pages. If None, an
                AsyncPlaywrightCrawlerStrategy is created.
            config: Configuration object for browser settings. If None, it is
                created from kwargs.
            always_bypass_cache: Whether to always bypass the cache (new parameter).
            always_by_pass_cache: Deprecated, use always_bypass_cache instead.
            base_directory: Base directory for storing cache.
            thread_safe: Whether to use thread-safe operations.
            **kwargs: Additional arguments for backwards compatibility.
        """
        # Handle browser configuration
        browser_config = config
        if browser_config is None:
            # Create browser config from kwargs for backwards compatibility
            browser_config = BrowserConfig.from_kwargs(kwargs)

        self.browser_config = browser_config

        # Initialize logger first since other components may need it
        self.logger = AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
            verbose=self.browser_config.verbose,
            tag_width=10,
        )

        # Warn (now that the logger exists) if both config styles were mixed
        if config is not None and any(
            k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]
        ):
            self.logger.warning(
                message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
                tag="WARNING",
            )

        # Initialize crawler strategy; forward the remaining kwargs for
        # backwards compatibility, excluding keys passed explicitly below
        params = {k: v for k, v in kwargs.items() if k not in ["browser_config", "logger"]}
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            browser_config=browser_config,
            logger=self.logger,
            **params,
        )

        # If the crawler strategy doesn't have a logger, use the crawler's logger
        if not self.crawler_strategy.logger:
            self.crawler_strategy.logger = self.logger

        # Handle deprecated cache parameter
        if always_by_pass_cache is not None:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'always_bypass_cache' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2,
                )
            self.always_bypass_cache = always_by_pass_cache
        else:
            self.always_bypass_cache = always_bypass_cache

        # Thread safety setup
        self._lock = asyncio.Lock() if thread_safe else None

        # Initialize directories
        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

        self.ready = False
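
        # As the deprecation message above notes, passing warning=False through
        # kwargs (e.g. AsyncWebCrawler(always_by_pass_cache=True, warning=False))
        # suppresses the DeprecationWarning while keeping the legacy flag.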

    async def start(self):
        """
        Start the crawler explicitly without using a context manager.
        This is equivalent to using 'async with' but gives more control over the lifecycle.

        This method will:
        1. Initialize the browser and context
        2. Perform the warmup sequence
        3. Return the crawler instance for method chaining

        Returns:
            AsyncWebCrawler: The initialized crawler instance
        """
        await self.crawler_strategy.__aenter__()
        await self.awarmup()
        return self

    async def close(self):
        """
        Close the crawler explicitly without using a context manager.
        This should be called when you're done with the crawler if you used start().

        This method will:
        1. Clean up browser resources
        2. Close any open pages and contexts
        """
        await self.crawler_strategy.__aexit__(None, None, None)

    async def __aenter__(self):
        return await self.start()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def awarmup(self):
        """
        Initialize the crawler with a warm-up sequence.

        This method:
        1. Logs initialization info
        2. Sets up browser configuration
        3. Marks the crawler as ready
        """
        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
        self.ready = True

    @asynccontextmanager
    async def nullcontext(self):
        """Async null context manager (yields nothing)."""
        yield
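    # NOTE: arun() enters `self._lock or self.nullcontext()`, so the same code
    # path works whether or not thread_safe locking was enabled in __init__.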

    async def arun(
        self,
        url: str,
        config: Optional[CrawlerRunConfig] = None,
        # Legacy parameters maintained for backwards compatibility
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        content_filter: RelevantContentFilter = None,
        cache_mode: Optional[CacheMode] = None,
        # Deprecated cache parameters
        bypass_cache: bool = False,
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,
        # Other legacy parameters
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        """
        Runs the crawler for a single source: URL (web, local file, or raw HTML).

        Migration Guide:
            Old way (deprecated):
                result = await crawler.arun(
                    url="https://example.com",
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                result = await crawler.arun(url="https://example.com", config=config)

        Args:
            url: The URL to crawl (http://, https://, file://, or raw:)
            config: Configuration object controlling crawl behavior
            [other parameters maintained for backwards compatibility]

        Returns:
            CrawlResult: The result of crawling and processing
        """
        crawler_config = config
        if not isinstance(url, str) or not url:
            raise ValueError("Invalid URL, make sure the URL is a non-empty string")

        async with self._lock or self.nullcontext():
            try:
                # Handle configuration
                if crawler_config is not None:
                    config = crawler_config
                else:
                    # Merge all parameters into a single kwargs dict for config creation
                    config_kwargs = {
                        "word_count_threshold": word_count_threshold,
                        "extraction_strategy": extraction_strategy,
                        "chunking_strategy": chunking_strategy,
                        "content_filter": content_filter,
                        "cache_mode": cache_mode,
                        "bypass_cache": bypass_cache,
                        "disable_cache": disable_cache,
                        "no_cache_read": no_cache_read,
                        "no_cache_write": no_cache_write,
                        "css_selector": css_selector,
                        "screenshot": screenshot,
                        "pdf": pdf,
                        "verbose": verbose,
                        **kwargs,
                    }
                    config = CrawlerRunConfig.from_kwargs(config_kwargs)

                # Handle deprecated cache parameters
                if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
                    if kwargs.get("warning", True):
                        warnings.warn(
                            "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
                            "Use 'cache_mode' parameter instead.",
                            DeprecationWarning,
                            stacklevel=2,
                        )

                    # Convert legacy parameters if cache_mode not provided
                    if config.cache_mode is None:
                        config.cache_mode = _legacy_to_cache_mode(
                            disable_cache=disable_cache,
                            bypass_cache=bypass_cache,
                            no_cache_read=no_cache_read,
                            no_cache_write=no_cache_write,
                        )

                # Default to ENABLED if no cache mode specified
                if config.cache_mode is None:
                    config.cache_mode = CacheMode.ENABLED

                # Create cache context
                cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache)

                # Initialize processing variables
                async_response: AsyncCrawlResponse = None
                cached_result: CrawlResult = None
                html = None  # stays None until cached or freshly fetched HTML arrives
                screenshot_data = None
                pdf_data = None
                extracted_content = None
                start_time = time.perf_counter()

                # Try to get cached result if appropriate
                if cache_context.should_read():
                    cached_result = await async_db_manager.aget_cached_url(url)

                if cached_result:
                    html = sanitize_input_encode(cached_result.html)
                    extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                    extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
                    screenshot_data = cached_result.screenshot
                    pdf_data = cached_result.pdf
                    # If a screenshot or PDF is requested but missing from the
                    # cache, discard the cached result and re-fetch
                    if (config.screenshot and not screenshot_data) or (config.pdf and not pdf_data):
                        cached_result = None

                    self.logger.url_status(
                        url=cache_context.display_url,
                        success=bool(html),
                        timing=time.perf_counter() - start_time,
                        tag="FETCH",
                    )

                # Fetch fresh content if needed
                if not cached_result or not html:
                    t1 = time.perf_counter()

                    if user_agent:
                        self.crawler_strategy.update_user_agent(user_agent)

                    # Pass the entire config object to the crawl method
                    async_response = await self.crawler_strategy.crawl(
                        url,
                        config=config,
                    )

                    html = sanitize_input_encode(async_response.html)
                    screenshot_data = async_response.screenshot
                    pdf_data = async_response.pdf_data

                    t2 = time.perf_counter()
                    self.logger.url_status(
                        url=cache_context.display_url,
                        success=bool(html),
                        timing=t2 - t1,
                        tag="FETCH",
                    )

                    # Process the HTML content
                    crawl_result = await self.aprocess_html(
                        url=url,
                        html=html,
                        extracted_content=extracted_content,
                        config=config,  # Pass the config object instead of individual parameters
                        screenshot=screenshot_data,
                        pdf_data=pdf_data,
                        verbose=config.verbose,
                        is_raw_html=url.startswith("raw:"),
                        **kwargs,
                    )

                    crawl_result.status_code = async_response.status_code
                    crawl_result.response_headers = async_response.response_headers
                    crawl_result.downloaded_files = async_response.downloaded_files
                    crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate

                    crawl_result.success = bool(html)
                    crawl_result.session_id = getattr(config, "session_id", None)

                    self.logger.success(
                        message="{url:.50}... | Status: {status} | Total: {timing}",
                        tag="COMPLETE",
                        params={
                            "url": cache_context.display_url,
                            "status": crawl_result.success,
                            "timing": f"{time.perf_counter() - start_time:.2f}s",
                        },
                        colors={
                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
                            "timing": Fore.YELLOW,
                        },
                    )

                    # Update cache if appropriate
                    if cache_context.should_write() and not bool(cached_result):
                        await async_db_manager.acache_url(crawl_result)

                    return crawl_result

                else:
                    self.logger.success(
                        message="{url:.50}... | Status: {status} | Total: {timing}",
                        tag="COMPLETE",
                        params={
                            "url": cache_context.display_url,
                            "status": True,
                            "timing": f"{time.perf_counter() - start_time:.2f}s",
                        },
                        colors={
                            "status": Fore.GREEN,
                            "timing": Fore.YELLOW,
                        },
                    )

                    cached_result.success = bool(html)
                    cached_result.session_id = getattr(config, "session_id", None)
                    return cached_result

            except Exception as e:
                error_context = get_error_context(sys.exc_info())

                error_message = (
                    f"Unexpected error in arun at line {error_context['line_no']} "
                    f"in {error_context['function']} ({error_context['filename']}):\n"
                    f"Error: {str(e)}\n\n"
                    f"Code context:\n{error_context['code_context']}"
                )

                self.logger.error_status(
                    url=url,
                    error=create_box_message(error_message, type="error"),
                    tag="ERROR",
                )

                return CrawlResult(
                    url=url,
                    html="",
                    success=False,
                    error_message=error_message,
                )
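    # NOTE: besides web URLs, arun() accepts "file://" paths and inline markup
    # via the "raw:" prefix (e.g. url="raw:<html><body>Hi</body></html>"),
    # which aprocess_html() detects through the is_raw_html flag.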

    async def aprocess_html(
        self,
        url: str,
        html: str,
        extracted_content: str,
        config: CrawlerRunConfig,
        screenshot: str,
        pdf_data: str,
        verbose: bool,
        **kwargs,
    ) -> CrawlResult:
        """
        Process HTML content using the provided configuration.

        Args:
            url: The URL being processed
            html: Raw HTML content
            extracted_content: Previously extracted content (if any)
            config: Configuration object controlling processing behavior
            screenshot: Screenshot data (if any)
            pdf_data: PDF data (if any)
            verbose: Whether to enable verbose logging
            **kwargs: Additional parameters for backwards compatibility

        Returns:
            CrawlResult: Processed result containing extracted and formatted content
        """
        try:
            _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
            t1 = time.perf_counter()

            # Initialize scraping strategy
            scraping_strategy = WebScrapingStrategy(logger=self.logger)

            # Process HTML content
            params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
            # Add keys from kwargs that don't already exist in params
            params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

            result = scraping_strategy.scrap(
                url,
                html,
                **params,
            )

            if result is None:
                raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))
        except Exception as e:
            raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")

        # Extract results
        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
        fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
        fit_html = sanitize_input_encode(result.get("fit_html", ""))
        media = result.get("media", [])
        links = result.get("links", [])
        metadata = result.get("metadata", {})

        # Markdown generation
        markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()

        # Uncomment to use PruningContentFilter by default:
        # if not config.content_filter and not markdown_generator.content_filter:
        #     markdown_generator.content_filter = PruningContentFilter()

        markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
            cleaned_html=cleaned_html,
            base_url=url,
            # html2text_options=kwargs.get('html2text', {})
        )
        markdown_v2 = markdown_result
        markdown = sanitize_input_encode(markdown_result.raw_markdown)

        # Log processing completion
        self.logger.info(
            message="Processed {url:.50}... | Time: {timing}ms",
            tag="SCRAPE",
            params={
                "url": _url,
                "timing": int((time.perf_counter() - t1) * 1000),
            },
        )

        # Handle content extraction if needed
        if (extracted_content is None and
                config.extraction_strategy and
                config.chunking_strategy and
                not isinstance(config.extraction_strategy, NoExtractionStrategy)):

            t1 = time.perf_counter()

            # Choose content based on input_format
            content_format = config.extraction_strategy.input_format
            if content_format == "fit_markdown" and not markdown_result.fit_markdown:
                self.logger.warning(
                    message="Fit markdown requested but not available. Falling back to raw markdown.",
                    tag="EXTRACT",
                    params={"url": _url},
                )
                content_format = "markdown"

            content = {
                "markdown": markdown,
                "html": html,
                "fit_markdown": markdown_result.fit_markdown,
            }.get(content_format, markdown)

            # Use IdentityChunking for HTML input, otherwise use the provided chunking strategy
            chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy
            sections = chunking.chunk(content)
            extracted_content = config.extraction_strategy.run(url, sections)
            extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

            # Log extraction completion
            self.logger.info(
                message="Completed for {url:.50}... | Time: {timing}s",
                tag="EXTRACT",
                params={
                    "url": _url,
                    "timing": time.perf_counter() - t1,
                },
            )

        # Handle screenshot and PDF data
        screenshot_data = screenshot or None
        pdf_data = pdf_data or None

        # Apply HTML formatting if requested (attribute spelling matches CrawlerRunConfig)
        if config.prettiify:
            cleaned_html = fast_format_html(cleaned_html)

        # Return complete crawl result
        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown_v2=markdown_v2,
            markdown=markdown,
            fit_markdown=fit_markdown,
            fit_html=fit_html,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=screenshot_data,
            pdf=pdf_data,
            extracted_content=extracted_content,
            success=True,
            error_message="",
        )
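    # NOTE: aprocess_html() returns markdown_v2 as the full
    # MarkdownGenerationResult (raw_markdown, fit_markdown, ...), while
    # `markdown` keeps the plain raw string for backwards compatibility.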

    async def arun_many(
        self,
        urls: List[str],
        config: Optional[CrawlerRunConfig] = None,
        # Legacy parameters maintained for backwards compatibility
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        content_filter: RelevantContentFilter = None,
        cache_mode: Optional[CacheMode] = None,
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        pdf: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> List[CrawlResult]:
        """
        Runs the crawler for multiple URLs concurrently.

        Migration Guide:
            Old way (deprecated):
                results = await crawler.arun_many(
                    urls,
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )

            New way (recommended):
                config = CrawlerRunConfig(
                    word_count_threshold=200,
                    screenshot=True,
                    ...
                )
                results = await crawler.arun_many(urls, config=config)

        Args:
            urls: List of URLs to crawl
            config: Configuration object controlling crawl behavior for all URLs
            [other parameters maintained for backwards compatibility]

        Returns:
            List[CrawlResult]: Results for each URL
        """
        crawler_config = config
        # Handle configuration
        if crawler_config is not None:
            # Warn only when a legacy parameter differs from its default
            legacy_overrides = [
                word_count_threshold != MIN_WORD_THRESHOLD,
                extraction_strategy is not None,
                content_filter is not None,
                cache_mode is not None,
                css_selector is not None,
                screenshot,
                pdf,
            ]
            if any(legacy_overrides):
                self.logger.warning(
                    message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
                    tag="WARNING",
                )
            config = crawler_config
        else:
            # Merge all parameters into a single kwargs dict for config creation
            config_kwargs = {
                "word_count_threshold": word_count_threshold,
                "extraction_strategy": extraction_strategy,
                "chunking_strategy": chunking_strategy,
                "content_filter": content_filter,
                "cache_mode": cache_mode,
                "bypass_cache": bypass_cache,
                "css_selector": css_selector,
                "screenshot": screenshot,
                "pdf": pdf,
                "verbose": verbose,
                **kwargs,
            }
            config = CrawlerRunConfig.from_kwargs(config_kwargs)

        if bypass_cache:
            if kwargs.get("warning", True):
                warnings.warn(
                    "'bypass_cache' is deprecated and will be removed in version 0.5.0. "
                    "Use 'cache_mode=CacheMode.BYPASS' instead. "
                    "Pass warning=False to suppress this warning.",
                    DeprecationWarning,
                    stacklevel=2,
                )
            if config.cache_mode is None:
                config.cache_mode = CacheMode.BYPASS

        semaphore_count = config.semaphore_count or 5
        semaphore = asyncio.Semaphore(semaphore_count)

        async def crawl_with_semaphore(url):
            # Handle rate limiting per domain
            domain = urlparse(url).netloc
            current_time = time.time()

            self.logger.debug(
                message="Started task for {url:.50}...",
                tag="PARALLEL",
                params={"url": url},
            )

            # Get delay settings from config
            mean_delay = config.mean_delay
            max_range = config.max_range

            # Apply rate limiting: sleep if this domain was hit too recently
            if domain in self._domain_last_hit:
                time_since_last = current_time - self._domain_last_hit[domain]
                if time_since_last < mean_delay:
                    delay = mean_delay + random.uniform(0, max_range)
                    await asyncio.sleep(delay)

            self._domain_last_hit[domain] = current_time

            async with semaphore:
                return await self.arun(
                    url,
                    config=config,  # Pass the entire config object
                    user_agent=user_agent,  # Maintain user_agent override capability
                )

        # Log start of concurrent crawling
        self.logger.info(
            message="Starting concurrent crawling for {count} URLs...",
            tag="INIT",
            params={"count": len(urls)},
        )

        # Execute concurrent crawls
        start_time = time.perf_counter()
        tasks = [crawl_with_semaphore(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        end_time = time.perf_counter()

        # Log completion
        self.logger.success(
            message="Concurrent crawling completed for {count} URLs | Total time: {timing}",
            tag="COMPLETE",
            params={
                "count": len(urls),
                "timing": f"{end_time - start_time:.2f}s",
            },
            colors={"timing": Fore.YELLOW},
        )

        return [result if not isinstance(result, Exception) else str(result) for result in results]
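    # NOTE: failed URLs come back from arun_many() as plain error strings (see
    # the return_exceptions handling above), so callers should check each
    # entry's type before treating it as a CrawlResult.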

    async def aclear_cache(self):
        """Clear the cache database."""
        await async_db_manager.cleanup()

    async def aflush_cache(self):
        """Flush the cache database."""
        await async_db_manager.aflush_db()

    async def aget_cache_size(self):
        """Get the total number of cached items."""
        return await async_db_manager.aget_total_count()
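

# --- Minimal usage sketch (illustrative only) ---
# Mirrors the examples in the class docstring; "https://example.com" is a
# placeholder URL and CacheMode.BYPASS is just one possible cache mode.
if __name__ == "__main__":
    async def _demo():
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url="https://example.com",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            print(result.markdown)

    asyncio.run(_demo())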