Refactor: Move adeep_crawl as method of crawler itself. Create attributes in CrawlResult to reconstruct the tree once deep crawling is completed
This commit is contained in:
@@ -10,13 +10,20 @@ import asyncio
|
|||||||
|
|
||||||
# from contextlib import nullcontext, asynccontextmanager
|
# from contextlib import nullcontext, asynccontextmanager
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult, DispatchResult
|
|
||||||
|
from .models import (
|
||||||
|
CrawlResult,
|
||||||
|
MarkdownGenerationResult,
|
||||||
|
CrawlerTaskResult,
|
||||||
|
DispatchResult,
|
||||||
|
DeepCrawlingProgress,
|
||||||
|
)
|
||||||
from .async_database import async_db_manager
|
from .async_database import async_db_manager
|
||||||
from .chunking_strategy import * # noqa: F403
|
from .chunking_strategy import * # noqa: F403
|
||||||
from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking
|
from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking
|
||||||
from .content_filter_strategy import * # noqa: F403
|
from .content_filter_strategy import * # noqa: F403
|
||||||
from .content_filter_strategy import RelevantContentFilter
|
from .content_filter_strategy import RelevantContentFilter
|
||||||
from .extraction_strategy import * # noqa: F403
|
from .extraction_strategy import * # noqa: F403
|
||||||
from .extraction_strategy import NoExtractionStrategy, ExtractionStrategy
|
from .extraction_strategy import NoExtractionStrategy, ExtractionStrategy
|
||||||
from .async_crawler_strategy import (
|
from .async_crawler_strategy import (
|
||||||
AsyncCrawlerStrategy,
|
AsyncCrawlerStrategy,
|
||||||
@@ -30,8 +37,9 @@ from .markdown_generation_strategy import (
|
|||||||
)
|
)
|
||||||
from .async_logger import AsyncLogger
|
from .async_logger import AsyncLogger
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .async_dispatcher import * # noqa: F403
|
from .async_dispatcher import * # noqa: F403
|
||||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||||
|
from .traversal import TraversalStrategy
|
||||||
|
|
||||||
from .config import MIN_WORD_THRESHOLD
|
from .config import MIN_WORD_THRESHOLD
|
||||||
from .utils import (
|
from .utils import (
|
||||||
@@ -46,7 +54,7 @@ from .utils import (
|
|||||||
from typing import Union, AsyncGenerator, List, TypeVar
|
from typing import Union, AsyncGenerator, List, TypeVar
|
||||||
from collections.abc import AsyncGenerator
|
from collections.abc import AsyncGenerator
|
||||||
|
|
||||||
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
|
||||||
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||||
|
|
||||||
from .__version__ import __version__ as crawl4ai_version
|
from .__version__ import __version__ as crawl4ai_version
|
||||||
@@ -257,7 +265,7 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def nullcontext(self):
|
async def nullcontext(self):
|
||||||
"""异步空上下文管理器"""
|
"""Asynchronous null context manager"""
|
||||||
yield
|
yield
|
||||||
|
|
||||||
async def arun(
|
async def arun(
|
||||||
@@ -420,14 +428,18 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Check robots.txt if enabled
|
# Check robots.txt if enabled
|
||||||
if config and config.check_robots_txt:
|
if config and config.check_robots_txt:
|
||||||
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
|
if not await self.robots_parser.can_fetch(
|
||||||
|
url, self.browser_config.user_agent
|
||||||
|
):
|
||||||
return CrawlResult(
|
return CrawlResult(
|
||||||
url=url,
|
url=url,
|
||||||
html="",
|
html="",
|
||||||
success=False,
|
success=False,
|
||||||
status_code=403,
|
status_code=403,
|
||||||
error_message="Access denied by robots.txt",
|
error_message="Access denied by robots.txt",
|
||||||
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
|
response_headers={
|
||||||
|
"X-Robots-Status": "Blocked by robots.txt"
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Pass config to crawl method
|
# Pass config to crawl method
|
||||||
@@ -449,7 +461,7 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Process the HTML content
|
# Process the HTML content
|
||||||
crawl_result : CrawlResult = await self.aprocess_html(
|
crawl_result: CrawlResult = await self.aprocess_html(
|
||||||
url=url,
|
url=url,
|
||||||
html=html,
|
html=html,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
@@ -731,8 +743,8 @@ class AsyncWebCrawler:
|
|||||||
pdf: bool = False,
|
pdf: bool = False,
|
||||||
user_agent: str = None,
|
user_agent: str = None,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
**kwargs
|
**kwargs,
|
||||||
) -> RunManyReturn:
|
) -> RunManyReturn:
|
||||||
"""
|
"""
|
||||||
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
||||||
|
|
||||||
@@ -786,7 +798,9 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
transform_result = lambda task_result: (
|
transform_result = lambda task_result: (
|
||||||
setattr(task_result.result, 'dispatch_result',
|
setattr(
|
||||||
|
task_result.result,
|
||||||
|
"dispatch_result",
|
||||||
DispatchResult(
|
DispatchResult(
|
||||||
task_id=task_result.task_id,
|
task_id=task_result.task_id,
|
||||||
memory_usage=task_result.memory_usage,
|
memory_usage=task_result.memory_usage,
|
||||||
@@ -794,21 +808,60 @@ class AsyncWebCrawler:
|
|||||||
start_time=task_result.start_time,
|
start_time=task_result.start_time,
|
||||||
end_time=task_result.end_time,
|
end_time=task_result.end_time,
|
||||||
error_message=task_result.error_message,
|
error_message=task_result.error_message,
|
||||||
)
|
),
|
||||||
) or task_result.result
|
)
|
||||||
|
or task_result.result
|
||||||
)
|
)
|
||||||
|
|
||||||
stream = config.stream
|
stream = config.stream
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
|
||||||
async def result_transformer():
|
async def result_transformer():
|
||||||
async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
|
async for task_result in dispatcher.run_urls_stream(
|
||||||
|
crawler=self, urls=urls, config=config
|
||||||
|
):
|
||||||
yield transform_result(task_result)
|
yield transform_result(task_result)
|
||||||
|
|
||||||
return result_transformer()
|
return result_transformer()
|
||||||
else:
|
else:
|
||||||
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
|
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
|
||||||
return [transform_result(res) for res in _results]
|
return [transform_result(res) for res in _results]
|
||||||
|
|
||||||
|
async def adeep_crawl(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
strategy: TraversalStrategy,
|
||||||
|
crawler_run_config: Optional[CrawlerRunConfig] = None,
|
||||||
|
stream: Optional[bool] = False,
|
||||||
|
) -> Union[AsyncGenerator[CrawlResult,None],List[CrawlResult]]:
|
||||||
|
"""
|
||||||
|
Traverse child URLs starting from the given URL, based on Traversal strategy
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: Starting URL for scraping
|
||||||
|
strategy: Traversal strategy to use
|
||||||
|
crawler_config: Configuration object controlling crawl behavior
|
||||||
|
stream (bool, optional): Whether to stream the results. Defaults to False.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of CrawlResults
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
result_generator = strategy.deep_crawl(
|
||||||
|
url, crawler=self, crawler_run_config=crawler_run_config
|
||||||
|
)
|
||||||
|
if stream:
|
||||||
|
return result_generator
|
||||||
|
else:
|
||||||
|
results = []
|
||||||
|
async for result in result_generator:
|
||||||
|
results.append(result)
|
||||||
|
return results
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Error in streaming Deep Crawl: {str(e)}")
|
||||||
|
raise
|
||||||
|
|
||||||
async def aclear_cache(self):
|
async def aclear_cache(self):
|
||||||
"""Clear the cache database."""
|
"""Clear the cache database."""
|
||||||
await async_db_manager.cleanup()
|
await async_db_manager.cleanup()
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import annotations
|
||||||
from pydantic import BaseModel, HttpUrl
|
from pydantic import BaseModel, HttpUrl
|
||||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
@@ -5,6 +6,7 @@ from dataclasses import dataclass
|
|||||||
from .ssl_certificate import SSLCertificate
|
from .ssl_certificate import SSLCertificate
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
from math import inf
|
||||||
|
|
||||||
|
|
||||||
###############################
|
###############################
|
||||||
@@ -95,6 +97,18 @@ class DispatchResult(BaseModel):
|
|||||||
error_message: str = ""
|
error_message: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class TraversalStats:
|
||||||
|
"""Statistics for the traversal process"""
|
||||||
|
|
||||||
|
start_time: datetime
|
||||||
|
urls_processed: int = 0
|
||||||
|
urls_failed: int = 0
|
||||||
|
urls_skipped: int = 0
|
||||||
|
total_depth_reached: int = 0
|
||||||
|
current_depth: int = 0
|
||||||
|
|
||||||
|
|
||||||
class CrawlResult(BaseModel):
|
class CrawlResult(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
html: str
|
html: str
|
||||||
@@ -118,6 +132,12 @@ class CrawlResult(BaseModel):
|
|||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
dispatch_result: Optional[DispatchResult] = None
|
dispatch_result: Optional[DispatchResult] = None
|
||||||
redirected_url: Optional[str] = None
|
redirected_url: Optional[str] = None
|
||||||
|
# Attributes for position
|
||||||
|
depth: Optional[int] = None
|
||||||
|
score: Optional[float] = -inf
|
||||||
|
# For referencing children and parents from a flattened list of CrawlResult elements
|
||||||
|
parent_url: Optional[str] = None
|
||||||
|
child_urls: Optional[List[str]] = None
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
@@ -161,12 +181,12 @@ class Link(BaseModel):
|
|||||||
|
|
||||||
class Media(BaseModel):
|
class Media(BaseModel):
|
||||||
images: List[MediaItem] = []
|
images: List[MediaItem] = []
|
||||||
videos: List[
|
videos: List[MediaItem] = (
|
||||||
MediaItem
|
[]
|
||||||
] = [] # Using MediaItem model for now, can be extended with Video model if needed
|
) # Using MediaItem model for now, can be extended with Video model if needed
|
||||||
audios: List[
|
audios: List[MediaItem] = (
|
||||||
MediaItem
|
[]
|
||||||
] = [] # Using MediaItem model for now, can be extended with Audio model if needed
|
) # Using MediaItem model for now, can be extended with Audio model if needed
|
||||||
|
|
||||||
|
|
||||||
class Links(BaseModel):
|
class Links(BaseModel):
|
||||||
|
|||||||
@@ -1,16 +0,0 @@
|
|||||||
from .async_web_scraper import AsyncWebScraper
|
|
||||||
from .bfs_scraper_strategy import BFSScraperStrategy
|
|
||||||
from .filters import (
|
|
||||||
URLFilter,
|
|
||||||
FilterChain,
|
|
||||||
URLPatternFilter,
|
|
||||||
ContentTypeFilter,
|
|
||||||
DomainFilter,
|
|
||||||
)
|
|
||||||
from .scorers import (
|
|
||||||
KeywordRelevanceScorer,
|
|
||||||
PathDepthScorer,
|
|
||||||
FreshnessScorer,
|
|
||||||
CompositeScorer,
|
|
||||||
)
|
|
||||||
from .scraper_strategy import ScraperStrategy
|
|
||||||
@@ -1,149 +0,0 @@
|
|||||||
from typing import Union, AsyncGenerator, Optional
|
|
||||||
from .scraper_strategy import ScraperStrategy
|
|
||||||
from .models import ScraperResult, CrawlResult, ScraperPageResult
|
|
||||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
|
||||||
import logging
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
from contextlib import AbstractAsyncContextManager
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ScrapingProgress:
|
|
||||||
"""Tracks the progress of a scraping operation."""
|
|
||||||
|
|
||||||
processed_urls: int = 0
|
|
||||||
failed_urls: int = 0
|
|
||||||
current_url: Optional[str] = None
|
|
||||||
|
|
||||||
|
|
||||||
class AsyncWebScraper(AbstractAsyncContextManager):
|
|
||||||
"""
|
|
||||||
A high-level web scraper that combines an async crawler with a scraping strategy.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
crawler_config (CrawlerRunConfig): Configuration for the crawler run
|
|
||||||
browser_config (BrowserConfig): Configuration for the browser
|
|
||||||
strategy (ScraperStrategy): The scraping strategy to use
|
|
||||||
logger (Optional[logging.Logger]): Custom logger for the scraper
|
|
||||||
"""
|
|
||||||
|
|
||||||
async def __aenter__(self):
|
|
||||||
# Initialize resources, if any
|
|
||||||
self.logger.info("Starting the async web scraper.")
|
|
||||||
return self
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
strategy: ScraperStrategy,
|
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None,
|
|
||||||
browser_config: Optional[BrowserConfig] = None,
|
|
||||||
logger: Optional[logging.Logger] = None,
|
|
||||||
):
|
|
||||||
if not isinstance(strategy, ScraperStrategy):
|
|
||||||
raise TypeError("strategy must be an instance of ScraperStrategy")
|
|
||||||
if browser_config is not None and not isinstance(browser_config, BrowserConfig):
|
|
||||||
raise TypeError(
|
|
||||||
"browser_config must be None or an instance of BrowserConfig"
|
|
||||||
)
|
|
||||||
if crawler_config is not None and not isinstance(
|
|
||||||
crawler_config, CrawlerRunConfig
|
|
||||||
):
|
|
||||||
raise TypeError(
|
|
||||||
"crawler_config must be None or an instance of CrawlerRunConfig"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.crawler_config = crawler_config
|
|
||||||
self.browser_config = browser_config
|
|
||||||
self.strategy = strategy
|
|
||||||
self.logger = logger or logging.getLogger(__name__)
|
|
||||||
self._progress = ScrapingProgress()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def progress(self) -> ScrapingProgress:
|
|
||||||
"""Get current scraping progress."""
|
|
||||||
return self._progress
|
|
||||||
|
|
||||||
@asynccontextmanager
|
|
||||||
async def _error_handling_context(self, url: str):
|
|
||||||
"""Context manager for handling errors during scraping."""
|
|
||||||
try:
|
|
||||||
yield
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error scraping {url}: {str(e)}")
|
|
||||||
self._progress.failed_urls += 1
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def ascrape(
|
|
||||||
self, url: str, stream: bool = False
|
|
||||||
) -> Union[AsyncGenerator[ScraperPageResult, None], ScraperResult]:
|
|
||||||
"""
|
|
||||||
Scrape a website starting from the given URL.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
url: Starting URL for scraping
|
|
||||||
stream: If True, yield results as they come; if False, collect all results
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Either an async generator yielding CrawlResults or a final ScraperResult
|
|
||||||
"""
|
|
||||||
self._progress = ScrapingProgress() # Reset progress
|
|
||||||
async with self._error_handling_context(url):
|
|
||||||
if stream:
|
|
||||||
return self._ascrape_yielding(url)
|
|
||||||
return await self._ascrape_collecting(url)
|
|
||||||
|
|
||||||
async def _ascrape_yielding(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
) -> AsyncGenerator[ScraperPageResult, None]:
|
|
||||||
"""Stream scraping results as they become available."""
|
|
||||||
try:
|
|
||||||
result_generator = self.strategy.ascrape(
|
|
||||||
url, self.crawler_config, self.browser_config
|
|
||||||
)
|
|
||||||
async for page_result in result_generator:
|
|
||||||
self._progress.processed_urls += 1
|
|
||||||
self._progress.current_url = page_result.result.url
|
|
||||||
yield page_result
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error in streaming scrape: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def _ascrape_collecting(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
) -> ScraperResult:
|
|
||||||
"""Collect all scraping results before returning."""
|
|
||||||
extracted_data = {}
|
|
||||||
|
|
||||||
try:
|
|
||||||
result_generator = self.strategy.ascrape(
|
|
||||||
url, self.crawler_config, self.browser_config
|
|
||||||
)
|
|
||||||
async for res in result_generator:
|
|
||||||
url = res.result.url
|
|
||||||
self._progress.processed_urls += 1
|
|
||||||
self._progress.current_url = url
|
|
||||||
extracted_data[url] = res
|
|
||||||
|
|
||||||
return ScraperResult(
|
|
||||||
url=url,
|
|
||||||
crawled_urls=list(extracted_data.keys()),
|
|
||||||
extracted_data=extracted_data,
|
|
||||||
stats={
|
|
||||||
"processed_urls": self._progress.processed_urls,
|
|
||||||
"failed_urls": self._progress.failed_urls,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error in collecting scrape: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
||||||
# Cleanup resources or tasks
|
|
||||||
await self.close() # Assuming you have a close method to cleanup
|
|
||||||
|
|
||||||
async def close(self):
|
|
||||||
# Perform cleanup tasks
|
|
||||||
pass
|
|
||||||
@@ -1,209 +0,0 @@
|
|||||||
from typing import AsyncGenerator, Optional, Dict, Set
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from datetime import datetime
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
from ..async_webcrawler import AsyncWebCrawler
|
|
||||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
|
||||||
from .models import CrawlResult, ScraperPageResult
|
|
||||||
from .filters import FilterChain
|
|
||||||
from .scorers import URLScorer
|
|
||||||
from .scraper_strategy import ScraperStrategy
|
|
||||||
from ..config import SCRAPER_BATCH_SIZE
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class CrawlStats:
|
|
||||||
"""Statistics for the crawling process"""
|
|
||||||
|
|
||||||
start_time: datetime
|
|
||||||
urls_processed: int = 0
|
|
||||||
urls_failed: int = 0
|
|
||||||
urls_skipped: int = 0
|
|
||||||
total_depth_reached: int = 0
|
|
||||||
current_depth: int = 0
|
|
||||||
|
|
||||||
|
|
||||||
class BFSScraperStrategy(ScraperStrategy):
|
|
||||||
"""Breadth-First Search scraping strategy with politeness controls"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
max_depth: int,
|
|
||||||
filter_chain: FilterChain,
|
|
||||||
url_scorer: URLScorer,
|
|
||||||
process_external_links: bool = False,
|
|
||||||
logger: Optional[logging.Logger] = None,
|
|
||||||
):
|
|
||||||
self.max_depth = max_depth
|
|
||||||
self.filter_chain = filter_chain
|
|
||||||
self.url_scorer = url_scorer
|
|
||||||
self.logger = logger or logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Crawl control
|
|
||||||
self.stats = CrawlStats(start_time=datetime.now())
|
|
||||||
self._cancel_event = asyncio.Event()
|
|
||||||
self.process_external_links = process_external_links
|
|
||||||
|
|
||||||
async def can_process_url(self, url: str, depth: int) -> bool:
|
|
||||||
"""Check if URL can be processed based on filters
|
|
||||||
This is our gatekeeper method that determines if a URL should be processed. It:
|
|
||||||
- Validates URL format using a robust built-in method
|
|
||||||
- Applies custom filters from the filter chain
|
|
||||||
- Updates statistics for blocked URLs
|
|
||||||
- Returns False early if any check fails
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
result = urlparse(url)
|
|
||||||
if not all([result.scheme, result.netloc]):
|
|
||||||
raise ValueError("Invalid URL")
|
|
||||||
if result.scheme not in ("http", "https"):
|
|
||||||
raise ValueError("URL must be HTTP or HTTPS")
|
|
||||||
if not result.netloc or "." not in result.netloc:
|
|
||||||
raise ValueError("Invalid domain")
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.warning(f"Invalid URL: {url}. Error: {str(e)}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Apply the filter chain if it's not start page
|
|
||||||
if depth != 0 and not self.filter_chain.apply(url):
|
|
||||||
return False
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
async def _process_links(
|
|
||||||
self,
|
|
||||||
result: CrawlResult,
|
|
||||||
source_url: str,
|
|
||||||
queue: asyncio.PriorityQueue,
|
|
||||||
visited: Set[str],
|
|
||||||
depths: Dict[str, int],
|
|
||||||
):
|
|
||||||
"""Process extracted links from crawl result.
|
|
||||||
This is our link processor that:
|
|
||||||
Checks depth limits
|
|
||||||
Handles both internal and external links
|
|
||||||
Checks if URL is visited already
|
|
||||||
Checks if URL can be processed - validates URL, applies Filters with can_process_url
|
|
||||||
Scores URLs for priority
|
|
||||||
Updates depth tracking dictionary
|
|
||||||
Adds valid URLs to the queue
|
|
||||||
Updates maximum depth statistics
|
|
||||||
"""
|
|
||||||
next_depth = depths[source_url] + 1
|
|
||||||
# If depth limit reached, exit without processing links
|
|
||||||
if next_depth > self.max_depth:
|
|
||||||
return
|
|
||||||
links_to_process = result.links["internal"]
|
|
||||||
if self.process_external_links:
|
|
||||||
links_to_process += result.links["external"]
|
|
||||||
for link in links_to_process:
|
|
||||||
url = link["href"]
|
|
||||||
if url in visited:
|
|
||||||
continue
|
|
||||||
if not await self.can_process_url(url, next_depth):
|
|
||||||
self.stats.urls_skipped += 1
|
|
||||||
continue
|
|
||||||
score = self.url_scorer.score(url) if self.url_scorer else 0
|
|
||||||
await queue.put((score, next_depth, url))
|
|
||||||
depths[url] = next_depth
|
|
||||||
self.stats.total_depth_reached = max(
|
|
||||||
self.stats.total_depth_reached, next_depth
|
|
||||||
)
|
|
||||||
|
|
||||||
async def ascrape(
|
|
||||||
self,
|
|
||||||
start_url: str,
|
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None,
|
|
||||||
browser_config: Optional[BrowserConfig] = None,
|
|
||||||
) -> AsyncGenerator[CrawlResult, None]:
|
|
||||||
"""Implement BFS crawling strategy"""
|
|
||||||
|
|
||||||
# Initialize crawl state
|
|
||||||
"""
|
|
||||||
queue: A priority queue where items are tuples of (score, depth, url)
|
|
||||||
Score: Determines crawling priority (lower = higher priority)
|
|
||||||
Depth: Current distance from start_url
|
|
||||||
URL: The actual URL to crawl
|
|
||||||
visited: Keeps track of URLs we've already seen to avoid cycles
|
|
||||||
depths: Maps URLs to their depths from the start URL
|
|
||||||
active_crawls: Tracks currently running crawl tasks
|
|
||||||
"""
|
|
||||||
queue = asyncio.PriorityQueue()
|
|
||||||
await queue.put((0, 0, start_url))
|
|
||||||
visited: Set[str] = set()
|
|
||||||
depths = {start_url: 0}
|
|
||||||
active_crawls = {} # Track URLs currently being processed with depth and score
|
|
||||||
active_crawls_lock = asyncio.Lock() # Create the lock within the same event loop
|
|
||||||
|
|
||||||
# Update crawler_config to stream back results to scraper
|
|
||||||
crawler_config = crawler_config.clone(stream=True) if crawler_config else CrawlerRunConfig(stream=True)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(
|
|
||||||
config=browser_config,
|
|
||||||
verbose=True,
|
|
||||||
) as crawler:
|
|
||||||
try:
|
|
||||||
while (
|
|
||||||
not queue.empty() or active_crawls
|
|
||||||
) and not self._cancel_event.is_set():
|
|
||||||
"""
|
|
||||||
This sets up our main control loop which:
|
|
||||||
- Continues while there are URLs to process (not queue.empty())
|
|
||||||
- Or while there are active crawls still running (arun_many)
|
|
||||||
- Can be interrupted via cancellation (not self._cancel_event.is_set())
|
|
||||||
"""
|
|
||||||
# Collect batch of URLs into active_crawls to process
|
|
||||||
async with active_crawls_lock:
|
|
||||||
while len(active_crawls) < SCRAPER_BATCH_SIZE and not queue.empty():
|
|
||||||
score, depth, url = await queue.get()
|
|
||||||
active_crawls[url] = {"depth": depth, "score": score}
|
|
||||||
self.stats.current_depth = depth
|
|
||||||
|
|
||||||
if not active_crawls:
|
|
||||||
# If no active crawls exist, wait a bit and continue
|
|
||||||
await asyncio.sleep(0.1)
|
|
||||||
continue
|
|
||||||
# Process batch
|
|
||||||
try:
|
|
||||||
async for result in await crawler.arun_many(
|
|
||||||
urls=list(active_crawls.keys()),
|
|
||||||
config=crawler_config.clone(stream=True),
|
|
||||||
):
|
|
||||||
source_url = result.url
|
|
||||||
depth = active_crawls[source_url]["depth"]
|
|
||||||
score=active_crawls[source_url]["score"]
|
|
||||||
async with active_crawls_lock:
|
|
||||||
active_crawls.pop(source_url, None)
|
|
||||||
|
|
||||||
if result.success:
|
|
||||||
await self._process_links(
|
|
||||||
result, source_url, queue, visited, depths
|
|
||||||
)
|
|
||||||
yield ScraperPageResult(
|
|
||||||
result = result,
|
|
||||||
depth=depth,
|
|
||||||
score=score,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.logger.warning(
|
|
||||||
f"Failed to crawl {result.url}: {result.error_message}"
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Batch processing error: {e}")
|
|
||||||
# Continue processing other batches
|
|
||||||
continue
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self.logger.error(f"Error in crawl process: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
finally:
|
|
||||||
self.stats.end_time = datetime.now()
|
|
||||||
await crawler.close()
|
|
||||||
|
|
||||||
async def shutdown(self):
|
|
||||||
"""Clean up resources and stop crawling"""
|
|
||||||
self._cancel_event.set()
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
from pydantic import BaseModel
|
|
||||||
from typing import List, Dict
|
|
||||||
from ..models import CrawlResult
|
|
||||||
|
|
||||||
class ScraperPageResult(BaseModel):
|
|
||||||
result: CrawlResult
|
|
||||||
depth: int
|
|
||||||
score: float
|
|
||||||
class ScraperResult(BaseModel):
|
|
||||||
url: str
|
|
||||||
crawled_urls: List[str]
|
|
||||||
extracted_data: Dict[str, ScraperPageResult]
|
|
||||||
@@ -1,40 +0,0 @@
|
|||||||
from abc import ABC, abstractmethod
|
|
||||||
from .models import ScraperResult, ScraperPageResult
|
|
||||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
|
||||||
from typing import Union, AsyncGenerator
|
|
||||||
class ScraperStrategy(ABC):
|
|
||||||
@abstractmethod
|
|
||||||
async def ascrape(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
crawler_config: CrawlerRunConfig,
|
|
||||||
browser_config: BrowserConfig,
|
|
||||||
stream: bool = False,
|
|
||||||
) -> Union[AsyncGenerator[ScraperPageResult, None], ScraperResult]:
|
|
||||||
"""Scrape the given URL using the specified crawler.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
url (str): The starting URL for the scrape.
|
|
||||||
crawler_config (CrawlerRunConfig): Configuration for the crawler run.
|
|
||||||
browser_config (BrowserConfig): Configuration for the browser.
|
|
||||||
stream (bool): If True, yields individual crawl results as they are ready;
|
|
||||||
if False, accumulates results and returns a final ScraperResult.
|
|
||||||
|
|
||||||
Yields:
|
|
||||||
ScraperPageResult: Individual page results if stream is True.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
ScraperResult: A summary of the scrape results containing the final extracted data
|
|
||||||
and the list of crawled URLs if stream is False.
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def can_process_url(self, url: str, depth: int) -> bool:
|
|
||||||
"""Check if URL can be processed based on strategy rules"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
async def shutdown(self):
|
|
||||||
"""Clean up resources used by the strategy"""
|
|
||||||
pass
|
|
||||||
29
crawl4ai/traversal/__init__.py
Normal file
29
crawl4ai/traversal/__init__.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
from .bfs_traversal_strategy import BFSTraversalStrategy
|
||||||
|
from .filters import (
|
||||||
|
URLFilter,
|
||||||
|
FilterChain,
|
||||||
|
URLPatternFilter,
|
||||||
|
ContentTypeFilter,
|
||||||
|
DomainFilter,
|
||||||
|
)
|
||||||
|
from .scorers import (
|
||||||
|
KeywordRelevanceScorer,
|
||||||
|
PathDepthScorer,
|
||||||
|
FreshnessScorer,
|
||||||
|
CompositeScorer,
|
||||||
|
)
|
||||||
|
from .traversal_strategy import TraversalStrategy
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"BFSTraversalStrategy",
|
||||||
|
"FilterChain",
|
||||||
|
"URLFilter",
|
||||||
|
"URLPatternFilter",
|
||||||
|
"ContentTypeFilter",
|
||||||
|
"DomainFilter",
|
||||||
|
"KeywordRelevanceScorer",
|
||||||
|
"PathDepthScorer",
|
||||||
|
"FreshnessScorer",
|
||||||
|
"CompositeScorer",
|
||||||
|
"TraversalStrategy",
|
||||||
|
]
|
||||||
197
crawl4ai/traversal/bfs_traversal_strategy.py
Normal file
197
crawl4ai/traversal/bfs_traversal_strategy.py
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
from typing import AsyncGenerator, Optional, Dict, Set, List
|
||||||
|
from datetime import datetime
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from ..async_configs import CrawlerRunConfig
|
||||||
|
from ..models import CrawlResult, TraversalStats
|
||||||
|
from .filters import FilterChain
|
||||||
|
from .scorers import URLScorer
|
||||||
|
from .traversal_strategy import TraversalStrategy
|
||||||
|
from ..config import SCRAPER_BATCH_SIZE
|
||||||
|
|
||||||
|
|
||||||
|
class BFSTraversalStrategy(TraversalStrategy):
    """Best-First Search traversal strategy with filtering and scoring.

    URLs are pulled from a priority queue ordered by (score, depth), so
    lower scores are crawled first.  Discovered links are validated,
    passed through ``filter_chain`` and scored with ``url_scorer`` before
    being re-queued, up to ``max_depth`` levels from the start URL.
    """

    def __init__(
        self,
        max_depth: int,
        filter_chain: FilterChain,
        url_scorer: URLScorer,
        process_external_links: bool = False,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Args:
            max_depth: Maximum link distance from the start URL to follow.
            filter_chain: Filters applied to every discovered URL
                (skipped for the start page itself, see can_process_url).
            url_scorer: Scorer used to prioritize the crawl frontier;
                may be None, in which case every URL gets score 0.
            process_external_links: Also follow links pointing off-site.
            logger: Optional logger; defaults to this module's logger.
        """
        self.max_depth = max_depth
        self.filter_chain = filter_chain
        self.url_scorer = url_scorer
        self.logger = logger or logging.getLogger(__name__)

        # Crawl control
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
        self.process_external_links = process_external_links

    async def can_process_url(self, url: str, depth: int) -> bool:
        """Check if URL can be processed based on filters.

        This is the gatekeeper method that determines if a URL should be
        processed. It:

        - Validates URL format (scheme, netloc, plausible domain)
        - Applies custom filters from the filter chain (not at depth 0)
        - Returns False early if any check fails
        """
        try:
            parsed = urlparse(url)
            if not all([parsed.scheme, parsed.netloc]):
                raise ValueError("Invalid URL")
            if parsed.scheme not in ("http", "https"):
                raise ValueError("URL must be HTTP or HTTPS")
            if not parsed.netloc or "." not in parsed.netloc:
                raise ValueError("Invalid domain")
        except Exception as e:
            self.logger.warning(f"Invalid URL: {url}. Error: {str(e)}")
            return False

        # Apply the filter chain unless this is the start page (depth 0)
        if depth != 0 and not self.filter_chain.apply(url):
            return False

        return True

    async def _process_links(
        self,
        result: CrawlResult,
        source_url: str,
        queue: asyncio.PriorityQueue,
        visited: Set[str],
        depths: Dict[str, int],
    ) -> List[str]:
        """Extract, validate, score and enqueue links found in *result*.

        For each link this method checks the depth limit, skips
        already-visited URLs, applies can_process_url, scores the URL,
        records its depth and pushes it onto the priority queue.

        Returns:
            The list of child URLs that were queued for crawling (empty
            when the depth limit is reached or nothing qualifies).
        """
        next_depth = depths[source_url] + 1
        # FIX: return [] (not a bare `return`) at the depth limit so the
        # annotated List[str] contract holds and result.child_urls stays
        # iterable for consumers reconstructing the crawl tree.
        if next_depth > self.max_depth:
            return []

        links_to_process = result.links["internal"]
        if self.process_external_links:
            links_to_process += result.links["external"]

        child_urls = []
        for link in links_to_process:
            url = link["href"]
            if url in visited:
                continue
            if not await self.can_process_url(url, next_depth):
                self.stats.urls_skipped += 1
                continue
            score = self.url_scorer.score(url) if self.url_scorer else 0
            # FIX: mark the URL as visited the moment it is queued.
            # Previously nothing ever populated `visited`, so a URL
            # reachable from several pages could be queued and crawled
            # multiple times.
            visited.add(url)
            child_urls.append(url)
            await queue.put((score, next_depth, url, source_url))
            depths[url] = next_depth
            self.stats.total_depth_reached = max(
                self.stats.total_depth_reached, next_depth
            )
        return child_urls

    async def deep_crawl(
        self,
        start_url: str,
        crawler: "AsyncWebCrawler",
        crawler_run_config: Optional[CrawlerRunConfig] = None,
    ) -> AsyncGenerator[CrawlResult, None]:
        """Implement BFS traversal strategy.

        Crawls in batches of up to SCRAPER_BATCH_SIZE URLs via
        crawler.arun_many (always streamed), annotates each successful
        CrawlResult with depth/score/parent_url/child_urls so the crawl
        tree can be reconstructed, and yields results as they complete.
        """
        # Traversal state:
        #   queue   - priority queue of (score, depth, url, parent_url);
        #             lower score == higher priority
        #   visited - URLs already queued/crawled, to avoid cycles
        #   depths  - URL -> distance from start_url
        #   active_crawls - URLs currently in flight (depth/score/parent)
        queue = asyncio.PriorityQueue()
        await queue.put((0, 0, start_url, None))
        # FIX: seed `visited` with the start URL so links pointing back
        # to it are never re-queued.
        visited: Set[str] = {start_url}
        depths = {start_url: 0}
        active_crawls = {}  # Track URLs currently being processed with depth and score
        # Create the lock within the same event loop
        active_crawls_lock = asyncio.Lock()
        try:
            # Main control loop: continue while URLs remain queued or in
            # flight, unless cancelled via shutdown().
            while (
                not queue.empty() or active_crawls
            ) and not self._cancel_event.is_set():
                # Collect a batch of URLs into active_crawls to process
                async with active_crawls_lock:
                    while len(active_crawls) < SCRAPER_BATCH_SIZE and not queue.empty():
                        score, depth, url, parent_url = await queue.get()
                        active_crawls[url] = {
                            "depth": depth,
                            "score": score,
                            "parent_url": parent_url,
                        }
                        self.stats.current_depth = depth

                if not active_crawls:
                    # If no active crawls exist, wait a bit and continue
                    await asyncio.sleep(0.1)
                    continue

                # Process batch
                try:
                    stream_config = (
                        crawler_run_config.clone(stream=True)
                        if crawler_run_config
                        else CrawlerRunConfig(stream=True)
                    )
                    async for result in await crawler.arun_many(
                        urls=list(active_crawls.keys()),
                        config=stream_config,
                    ):
                        async with active_crawls_lock:
                            crawl_info = active_crawls.pop(result.url, None)

                        if crawl_info and result.success:
                            child_urls = await self._process_links(
                                result, result.url, queue, visited, depths
                            )
                            # Attach tree-reconstruction metadata.
                            result.depth = crawl_info["depth"]
                            result.score = crawl_info["score"]
                            result.parent_url = crawl_info["parent_url"]
                            result.child_urls = child_urls
                            yield result
                        else:
                            self.logger.warning(
                                f"Failed to crawl {result.url}: {result.error_message}"
                            )
                except Exception as e:
                    self.logger.error(f"Batch processing error: {e}")
                    # Continue processing other batches
                    continue

        except Exception as e:
            self.logger.error(f"Error in crawl process: {e}")
            raise

        finally:
            self.stats.end_time = datetime.now()
            # NOTE(review): closing the caller-supplied crawler here is
            # surprising ownership — confirm callers expect deep_crawl to
            # end the crawler's lifetime.
            await crawler.close()

    async def shutdown(self):
        """Clean up resources and stop crawling."""
        self._cancel_event.set()
|
||||||
@@ -1,9 +1,8 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import List, Dict, Optional, Union
|
from typing import List, Dict, Optional
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from urllib.parse import urlparse, unquote
|
from urllib.parse import urlparse, unquote
|
||||||
import re
|
import re
|
||||||
from collections import defaultdict
|
|
||||||
import math
|
import math
|
||||||
import logging
|
import logging
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
31
crawl4ai/traversal/traversal_strategy.py
Normal file
31
crawl4ai/traversal/traversal_strategy.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
from abc import ABC, abstractmethod
from typing import AsyncGenerator, Optional

from ..async_configs import CrawlerRunConfig
from ..models import CrawlResult
|
||||||
|
|
||||||
|
|
||||||
|
class TraversalStrategy(ABC):
    """Abstract base class for deep-crawl traversal strategies.

    Concrete strategies decide in which order a site's link graph is
    visited and yield one CrawlResult per crawled page.
    """

    @abstractmethod
    async def deep_crawl(
        self,
        url: str,
        crawler: "AsyncWebCrawler",
        # FIX: `CrawlerRunConfig = None` relied on deprecated implicit
        # Optional (PEP 484); make the Optional explicit.  Forward refs
        # are quoted so the annotation never forces an import at class
        # creation time.
        crawler_run_config: Optional["CrawlerRunConfig"] = None,
    ) -> "AsyncGenerator[CrawlResult, None]":
        """Traverse the given URL using the specified crawler.

        Args:
            url (str): The starting URL for the traversal.
            crawler (AsyncWebCrawler): The crawler instance to use for traversal.
            crawler_run_config (CrawlerRunConfig, optional): The configuration for the crawler.

        Returns:
            AsyncGenerator[CrawlResult, None]: An async generator yielding crawl results.
        """
        pass

    @abstractmethod
    async def shutdown(self):
        """Clean up resources used by the strategy"""
        pass
|
||||||
@@ -1,20 +1,18 @@
|
|||||||
# basic_scraper_example.py
|
# basic_scraper_example.py
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
|
||||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||||
from crawl4ai.scraper import (
|
from crawl4ai.traversal import (
|
||||||
AsyncWebScraper,
|
BFSTraversalStrategy,
|
||||||
BFSScraperStrategy,
|
|
||||||
FilterChain,
|
FilterChain,
|
||||||
URLPatternFilter,
|
URLPatternFilter,
|
||||||
ContentTypeFilter,
|
ContentTypeFilter,
|
||||||
)
|
)
|
||||||
from crawl4ai.async_webcrawler import BrowserConfig
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
|
|
||||||
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
|
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
|
||||||
|
|
||||||
|
|
||||||
async def basic_scraper_example():
|
async def basic_scraper_example():
|
||||||
"""
|
"""
|
||||||
Basic example: Scrape a blog site for articles
|
Basic example: Scrape a blog site for articles
|
||||||
@@ -33,7 +31,7 @@ async def basic_scraper_example():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Initialize the strategy with basic configuration
|
# Initialize the strategy with basic configuration
|
||||||
bfs_strategy = BFSScraperStrategy(
|
bfs_strategy = BFSTraversalStrategy(
|
||||||
max_depth=2, # Only go 2 levels deep
|
max_depth=2, # Only go 2 levels deep
|
||||||
filter_chain=filter_chain,
|
filter_chain=filter_chain,
|
||||||
url_scorer=None, # Use default scoring
|
url_scorer=None, # Use default scoring
|
||||||
@@ -41,17 +39,18 @@ async def basic_scraper_example():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create the crawler and scraper
|
# Create the crawler and scraper
|
||||||
async with AsyncWebScraper(
|
async with AsyncWebCrawler(
|
||||||
strategy=bfs_strategy,
|
config=browser_config,
|
||||||
) as scraper:
|
) as crawler:
|
||||||
# Start scraping
|
# Start scraping
|
||||||
try:
|
try:
|
||||||
result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
|
results = await crawler.adeep_crawl(
|
||||||
|
"https://crawl4ai.com/mkdocs", strategy=bfs_strategy
|
||||||
|
)
|
||||||
# Process results
|
# Process results
|
||||||
print(f"Crawled {len(result.crawled_urls)} pages:")
|
print(f"Crawled {len(results)} pages:")
|
||||||
for url, page_result in result.extracted_data.items():
|
for result in results:
|
||||||
print(f"- {url}: {len(page_result.result.html)} bytes")
|
print(f"- {result.url}: {len(result.html)} bytes")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error during scraping: {e}")
|
print(f"Error during scraping: {e}")
|
||||||
@@ -60,9 +59,8 @@ async def basic_scraper_example():
|
|||||||
# advanced_scraper_example.py
|
# advanced_scraper_example.py
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from crawl4ai.scraper import (
|
from crawl4ai.traversal import (
|
||||||
AsyncWebScraper,
|
BFSTraversalStrategy,
|
||||||
BFSScraperStrategy,
|
|
||||||
FilterChain,
|
FilterChain,
|
||||||
URLPatternFilter,
|
URLPatternFilter,
|
||||||
ContentTypeFilter,
|
ContentTypeFilter,
|
||||||
@@ -123,34 +121,36 @@ async def advanced_scraper_example():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Initialize strategy with advanced configuration
|
# Initialize strategy with advanced configuration
|
||||||
bfs_strategy = BFSScraperStrategy(
|
bfs_strategy = BFSTraversalStrategy(
|
||||||
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
|
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
|
||||||
)
|
)
|
||||||
|
|
||||||
# Create crawler and scraper
|
# Create crawler and scraper
|
||||||
async with AsyncWebScraper(
|
async with AsyncWebCrawler(
|
||||||
strategy=bfs_strategy,
|
config=browser_config,
|
||||||
crawler_config=CrawlerRunConfig(bypass_cache=True, scraping_strategy=LXMLWebScrapingStrategy(),),
|
) as crawler:
|
||||||
browser_config=browser_config,
|
|
||||||
) as scraper:
|
|
||||||
|
|
||||||
# Track statistics
|
# Track statistics
|
||||||
stats = {"processed": 0, "errors": 0, "total_size": 0}
|
stats = {"processed": 0, "errors": 0, "total_size": 0}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use streaming mode
|
# Use streaming mode
|
||||||
result_generator = await scraper.ascrape(
|
result_generator = await crawler.adeep_crawl(
|
||||||
"https://techcrunch.com", stream=True
|
"https://techcrunch.com",
|
||||||
|
strategy=bfs_strategy,
|
||||||
|
crawler_run_config=CrawlerRunConfig(
|
||||||
|
scraping_strategy=LXMLWebScrapingStrategy()
|
||||||
|
),
|
||||||
|
stream=True,
|
||||||
)
|
)
|
||||||
async for page_result in result_generator:
|
async for result in result_generator:
|
||||||
result = page_result.result
|
|
||||||
score = page_result.score
|
|
||||||
depth = page_result.depth
|
|
||||||
stats["processed"] += 1
|
stats["processed"] += 1
|
||||||
|
|
||||||
if result.success:
|
if result.success:
|
||||||
stats["total_size"] += len(result.html)
|
stats["total_size"] += len(result.html)
|
||||||
logger.info(f"Processed at depth: {depth} with score: {score:.3f} : \n {result.url}")
|
logger.info(
|
||||||
|
f"Processed at depth: {result.depth} with score: {result.score:.3f} : \n {result.url}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
stats["errors"] += 1
|
stats["errors"] += 1
|
||||||
logger.error(
|
logger.error(
|
||||||
@@ -1,187 +0,0 @@
|
|||||||
# basic_scraper_example.py
|
|
||||||
from crawl4ai.scraper import (
|
|
||||||
AsyncWebScraper,
|
|
||||||
BFSScraperStrategy,
|
|
||||||
FilterChain,
|
|
||||||
URLPatternFilter,
|
|
||||||
ContentTypeFilter,
|
|
||||||
)
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
|
|
||||||
import re
|
|
||||||
|
|
||||||
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
|
|
||||||
|
|
||||||
|
|
||||||
async def basic_scraper_example():
    """
    Basic example: Scrape a blog site for articles
    - Crawls only HTML pages
    - Stays within the blog section
    - Collects all results at once
    """
    # Restrict the crawl to tutorial pages that are served as HTML.
    url_filters = FilterChain(
        [
            URLPatternFilter("*/tutorial/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )

    # Two levels deep, default scoring, external links allowed.
    strategy = BFSScraperStrategy(
        max_depth=2,
        filter_chain=url_filters,
        url_scorer=None,
        process_external_links=True,
    )

    # Wire the strategy into a scraper backed by an async crawler.
    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
        scraper = AsyncWebScraper(crawler, strategy)
        try:
            result = await scraper.ascrape("https://crawl4ai.com/mkdocs")

            # Report what was collected.
            print(f"Crawled {len(result.crawled_urls)} pages:")
            for url, data in result.extracted_data.items():
                print(f"- {url}: {len(data.html)} bytes")

        except Exception as e:
            print(f"Error during scraping: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
# advanced_scraper_example.py
|
|
||||||
import logging
|
|
||||||
from crawl4ai.scraper import (
|
|
||||||
AsyncWebScraper,
|
|
||||||
BFSScraperStrategy,
|
|
||||||
FilterChain,
|
|
||||||
URLPatternFilter,
|
|
||||||
ContentTypeFilter,
|
|
||||||
DomainFilter,
|
|
||||||
KeywordRelevanceScorer,
|
|
||||||
PathDepthScorer,
|
|
||||||
FreshnessScorer,
|
|
||||||
CompositeScorer,
|
|
||||||
)
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
||||||
|
|
||||||
|
|
||||||
async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
    - Includes monitoring and logging
    """
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_scraper")

    # Domain control: stay on techcrunch.com, skip auth/legal hosts.
    domain_filter = DomainFilter(
        allowed_domains=["techcrunch.com"],
        blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
    )
    # URL patterns: articles, news, blogs plus date-based archive URLs.
    pattern_filter = URLPatternFilter(
        [
            "*/article/*",
            "*/news/*",
            "*/blog/*",
            re.compile(r"\d{4}/\d{2}/.*"),
        ]
    )
    content_filter = ContentTypeFilter(["text/html", "application/xhtml+xml"])
    filter_chain = FilterChain([domain_filter, pattern_filter, content_filter])

    # Composite scoring: keyword relevance, URL structure, freshness.
    scorer = CompositeScorer(
        [
            KeywordRelevanceScorer(
                keywords=["news", "breaking", "update", "latest"], weight=1.0
            ),
            PathDepthScorer(optimal_depth=3, weight=0.7),
            FreshnessScorer(weight=0.9),
        ]
    )

    strategy = BFSScraperStrategy(
        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
    )

    async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
        scraper = AsyncWebScraper(crawler, strategy)

        stats = {"processed": 0, "errors": 0, "total_size": 0}

        try:
            # Stream results as they arrive rather than waiting for the
            # whole crawl to finish.
            result_generator = await scraper.ascrape(
                "https://techcrunch.com", stream=True
            )
            async for result in result_generator:
                stats["processed"] += 1

                if result.success:
                    stats["total_size"] += len(result.html)
                    logger.info(f"Processed: {result.url}")
                else:
                    stats["errors"] += 1
                    logger.error(
                        f"Failed to process {result.url}: {result.error_message}"
                    )

                # Periodic progress heartbeat.
                if stats["processed"] % 10 == 0:
                    logger.info(f"Progress: {stats['processed']} URLs processed")

        except Exception as e:
            logger.error(f"Scraping error: {e}")

        finally:
            # Summarize crawl, filter and scorer statistics.
            logger.info("Scraping completed:")
            logger.info(f"- URLs processed: {stats['processed']}")
            logger.info(f"- Errors: {stats['errors']}")
            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")

            for filter_ in filter_chain.filters:
                logger.info(f"{filter_.name} stats:")
                logger.info(f"- Passed: {filter_.stats.passed_urls}")
                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")

            logger.info("Scoring statistics:")
            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
            logger.info(
                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
            )
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    import asyncio

    # Run both demos back to back.
    demos = [
        ("Running basic scraper example...", basic_scraper_example),
        ("\nRunning advanced scraper example...", advanced_scraper_example),
    ]
    for banner, demo in demos:
        print(banner)
        asyncio.run(demo())
|
|
||||||
Reference in New Issue
Block a user