Refactor:Moved deep_crawl_strategy, inside crawler run config
This commit is contained in:
@@ -10,6 +10,7 @@ from .config import (
|
||||
from .user_agent_generator import UserAgentGenerator, UAGen, ValidUAGenerator, OnlineUAGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .deep_crawl import DeepCrawlStrategy
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||
@@ -395,6 +396,7 @@ class CrawlerRunConfig:
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
deep_crawl_strategy: DeepCrawlStrategy = None,
|
||||
markdown_generator: MarkdownGenerationStrategy = None,
|
||||
content_filter : RelevantContentFilter = None,
|
||||
only_text: bool = False,
|
||||
@@ -468,6 +470,7 @@ class CrawlerRunConfig:
|
||||
self.word_count_threshold = word_count_threshold
|
||||
self.extraction_strategy = extraction_strategy
|
||||
self.chunking_strategy = chunking_strategy
|
||||
self.deep_crawl_strategy = deep_crawl_strategy
|
||||
self.markdown_generator = markdown_generator
|
||||
self.content_filter = content_filter
|
||||
self.only_text = only_text
|
||||
@@ -555,6 +558,14 @@ class CrawlerRunConfig:
|
||||
raise ValueError(
|
||||
"extraction_strategy must be an instance of ExtractionStrategy"
|
||||
)
|
||||
|
||||
if self.deep_crawl_strategy is not None and not isinstance(
|
||||
self.deep_crawl_strategy, DeepCrawlStrategy
|
||||
):
|
||||
raise ValueError(
|
||||
"deep_crawl_strategy must be an instance of DeepCrawlStrategy"
|
||||
)
|
||||
|
||||
if self.chunking_strategy is not None and not isinstance(
|
||||
self.chunking_strategy, ChunkingStrategy
|
||||
):
|
||||
@@ -573,6 +584,7 @@ class CrawlerRunConfig:
|
||||
word_count_threshold=kwargs.get("word_count_threshold", 200),
|
||||
extraction_strategy=kwargs.get("extraction_strategy"),
|
||||
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
|
||||
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
|
||||
markdown_generator=kwargs.get("markdown_generator"),
|
||||
content_filter=kwargs.get("content_filter"),
|
||||
only_text=kwargs.get("only_text", False),
|
||||
@@ -656,6 +668,7 @@ class CrawlerRunConfig:
|
||||
"word_count_threshold": self.word_count_threshold,
|
||||
"extraction_strategy": self.extraction_strategy,
|
||||
"chunking_strategy": self.chunking_strategy,
|
||||
"deep_crawl_strategy": self.deep_crawl_strategy,
|
||||
"markdown_generator": self.markdown_generator,
|
||||
"content_filter": self.content_filter,
|
||||
"only_text": self.only_text,
|
||||
|
||||
@@ -38,7 +38,7 @@ from .async_logger import AsyncLogger
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .async_dispatcher import * # noqa: F403
|
||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||
from .traversal import TraversalStrategy
|
||||
from .deep_crawl import DeepCrawlStrategy
|
||||
|
||||
from .config import MIN_WORD_THRESHOLD
|
||||
from .utils import (
|
||||
@@ -53,11 +53,17 @@ from .utils import (
|
||||
from typing import Union, AsyncGenerator, List, TypeVar
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
|
||||
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
|
||||
from .__version__ import __version__ as crawl4ai_version
|
||||
|
||||
CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
|
||||
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
|
||||
DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
DeepCrawlManyReturn = Union[
|
||||
List[List[CrawlResultT]],
|
||||
AsyncGenerator[CrawlResultT, None],
|
||||
]
|
||||
|
||||
class AsyncWebCrawler:
|
||||
"""
|
||||
@@ -289,7 +295,7 @@ class AsyncWebCrawler:
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> CrawlResult:
|
||||
) -> Union[CrawlResult, DeepCrawlSingleReturn]:
|
||||
"""
|
||||
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
||||
|
||||
@@ -391,6 +397,23 @@ class AsyncWebCrawler:
|
||||
extracted_content = None
|
||||
start_time = time.perf_counter()
|
||||
|
||||
if crawler_config.deep_crawl_strategy:
|
||||
if crawler_config.stream:
|
||||
return crawler_config.deep_crawl_strategy.arun(
|
||||
start_url=url,
|
||||
crawler=self,
|
||||
crawler_run_config=crawler_config,
|
||||
)
|
||||
else:
|
||||
results = []
|
||||
async for result in crawler_config.deep_crawl_strategy.arun(
|
||||
start_url=url,
|
||||
crawler=self,
|
||||
crawler_run_config=crawler_config,
|
||||
):
|
||||
results.append(result)
|
||||
return results
|
||||
|
||||
# Try to get cached result if appropriate
|
||||
if cache_context.should_read():
|
||||
cached_result = await async_db_manager.aget_cached_url(url)
|
||||
@@ -743,7 +766,7 @@ class AsyncWebCrawler:
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
) -> RunManyReturn:
|
||||
) -> Union[RunManyReturn, DeepCrawlManyReturn]:
|
||||
"""
|
||||
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
||||
|
||||
@@ -830,7 +853,7 @@ class AsyncWebCrawler:
|
||||
async def adeep_crawl(
|
||||
self,
|
||||
url: str,
|
||||
strategy: TraversalStrategy,
|
||||
strategy: DeepCrawlStrategy,
|
||||
crawler_run_config: Optional[CrawlerRunConfig] = None,
|
||||
stream: Optional[bool] = False,
|
||||
) -> Union[AsyncGenerator[CrawlResult,None],List[CrawlResult]]:
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from .bfs_traversal_strategy import BFSTraversalStrategy
|
||||
from .bfs_deep_crawl_strategy import BFSDeepCrawlStrategy
|
||||
from .filters import (
|
||||
URLFilter,
|
||||
FilterChain,
|
||||
@@ -12,10 +12,10 @@ from .scorers import (
|
||||
FreshnessScorer,
|
||||
CompositeScorer,
|
||||
)
|
||||
from .traversal_strategy import TraversalStrategy
|
||||
from .deep_crawl_strategty import DeepCrawlStrategy
|
||||
|
||||
__all__ = [
|
||||
"BFSTraversalStrategy",
|
||||
"BFSDeepCrawlStrategy",
|
||||
"FilterChain",
|
||||
"URLFilter",
|
||||
"URLPatternFilter",
|
||||
@@ -25,5 +25,5 @@ __all__ = [
|
||||
"PathDepthScorer",
|
||||
"FreshnessScorer",
|
||||
"CompositeScorer",
|
||||
"TraversalStrategy",
|
||||
"DeepCrawlStrategy",
|
||||
]
|
||||
@@ -3,15 +3,14 @@ from datetime import datetime
|
||||
import asyncio
|
||||
import logging
|
||||
from urllib.parse import urlparse
|
||||
from ..async_configs import CrawlerRunConfig
|
||||
from ..models import CrawlResult, TraversalStats
|
||||
from .filters import FilterChain
|
||||
from .scorers import URLScorer
|
||||
from .traversal_strategy import TraversalStrategy
|
||||
from .deep_crawl_strategty import DeepCrawlStrategy
|
||||
from ..config import DEEP_CRAWL_BATCH_SIZE
|
||||
|
||||
|
||||
class BFSTraversalStrategy(TraversalStrategy):
|
||||
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
"""Best-First Search traversal strategy with filtering and scoring."""
|
||||
|
||||
def __init__(
|
||||
@@ -98,11 +97,11 @@ class BFSTraversalStrategy(TraversalStrategy):
|
||||
self.stats.total_depth_reached, next_depth
|
||||
)
|
||||
|
||||
async def deep_crawl(
|
||||
async def arun(
|
||||
self,
|
||||
start_url: str,
|
||||
crawler: "AsyncWebCrawler",
|
||||
crawler_run_config: Optional[CrawlerRunConfig] = None,
|
||||
crawler_run_config: Optional["CrawlerRunConfig"] = None,
|
||||
) -> AsyncGenerator[CrawlResult, None]:
|
||||
"""Implement BFS traversal strategy"""
|
||||
|
||||
@@ -136,7 +135,9 @@ class BFSTraversalStrategy(TraversalStrategy):
|
||||
"""
|
||||
# Collect batch of URLs into active_crawls to process
|
||||
async with active_crawls_lock:
|
||||
while len(active_crawls) < DEEP_CRAWL_BATCH_SIZE and not queue.empty():
|
||||
while (
|
||||
len(active_crawls) < DEEP_CRAWL_BATCH_SIZE and not queue.empty()
|
||||
):
|
||||
score, depth, url, parent_url = await queue.get()
|
||||
active_crawls[url] = {
|
||||
"depth": depth,
|
||||
@@ -151,14 +152,14 @@ class BFSTraversalStrategy(TraversalStrategy):
|
||||
continue
|
||||
# Process batch
|
||||
try:
|
||||
stream_config = (
|
||||
crawler_run_config.clone(stream=True)
|
||||
if crawler_run_config
|
||||
else CrawlerRunConfig(stream=True)
|
||||
)
|
||||
# This is very important to ensure recursively you don't deep_crawl down the children.
|
||||
if crawler_run_config:
|
||||
crawler_run_config = crawler_run_config.clone(
|
||||
deep_crawl_strategy=None, stream=True
|
||||
)
|
||||
async for result in await crawler.arun_many(
|
||||
urls=list(active_crawls.keys()),
|
||||
config=stream_config,
|
||||
config=crawler_run_config
|
||||
):
|
||||
async with active_crawls_lock:
|
||||
crawl_info = active_crawls.pop(result.url, None)
|
||||
@@ -1,17 +1,16 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import AsyncGenerator
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
from ..async_configs import CrawlerRunConfig
|
||||
from ..models import CrawlResult
|
||||
|
||||
|
||||
class TraversalStrategy(ABC):
|
||||
class DeepCrawlStrategy(ABC):
|
||||
@abstractmethod
|
||||
async def deep_crawl(
|
||||
async def arun(
|
||||
self,
|
||||
url: str,
|
||||
crawler: "AsyncWebCrawler",
|
||||
crawler_run_config: CrawlerRunConfig = None,
|
||||
crawler_run_config: Optional["CrawlerRunConfig"] = None,
|
||||
) -> AsyncGenerator[CrawlResult, None]:
|
||||
"""Traverse the given URL using the specified crawler.
|
||||
|
||||
@@ -140,7 +140,6 @@ class CrawlResult(BaseModel):
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
response_headers: Dict[str, str]
|
||||
|
||||
@@ -1,18 +1,25 @@
|
||||
# basic_scraper_example.py
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
from crawl4ai.traversal import (
|
||||
BFSTraversalStrategy,
|
||||
from crawl4ai.deep_crawl import (
|
||||
BFSDeepCrawlStrategy,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
KeywordRelevanceScorer,
|
||||
PathDepthScorer,
|
||||
FreshnessScorer,
|
||||
CompositeScorer,
|
||||
)
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
|
||||
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
|
||||
|
||||
|
||||
async def basic_scraper_example():
|
||||
"""
|
||||
Basic example: Scrape a blog site for articles
|
||||
@@ -31,7 +38,7 @@ async def basic_scraper_example():
|
||||
)
|
||||
|
||||
# Initialize the strategy with basic configuration
|
||||
bfs_strategy = BFSTraversalStrategy(
|
||||
bfs_strategy = BFSDeepCrawlStrategy(
|
||||
max_depth=2, # Only go 2 levels deep
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=None, # Use default scoring
|
||||
@@ -44,8 +51,8 @@ async def basic_scraper_example():
|
||||
) as crawler:
|
||||
# Start scraping
|
||||
try:
|
||||
results = await crawler.adeep_crawl(
|
||||
"https://crawl4ai.com/mkdocs", strategy=bfs_strategy
|
||||
results = await crawler.arun(
|
||||
"https://crawl4ai.com/mkdocs", CrawlerRunConfig(deep_crawl_strategy=bfs_strategy)
|
||||
)
|
||||
# Process results
|
||||
print(f"Crawled {len(results)} pages:")
|
||||
@@ -55,23 +62,6 @@ async def basic_scraper_example():
|
||||
except Exception as e:
|
||||
print(f"Error during scraping: {e}")
|
||||
|
||||
|
||||
# advanced_scraper_example.py
|
||||
import logging
|
||||
|
||||
from crawl4ai.traversal import (
|
||||
BFSTraversalStrategy,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
KeywordRelevanceScorer,
|
||||
PathDepthScorer,
|
||||
FreshnessScorer,
|
||||
CompositeScorer,
|
||||
)
|
||||
|
||||
|
||||
async def advanced_scraper_example():
|
||||
"""
|
||||
Advanced example: Intelligent news site scraping
|
||||
@@ -121,7 +111,7 @@ async def advanced_scraper_example():
|
||||
)
|
||||
|
||||
# Initialize strategy with advanced configuration
|
||||
bfs_strategy = BFSTraversalStrategy(
|
||||
bfs_strategy = BFSDeepCrawlStrategy(
|
||||
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
|
||||
)
|
||||
|
||||
@@ -136,13 +126,10 @@ async def advanced_scraper_example():
|
||||
try:
|
||||
# Use streaming mode
|
||||
results = []
|
||||
result_generator = await crawler.adeep_crawl(
|
||||
result_generator = await crawler.arun(
|
||||
"https://techcrunch.com",
|
||||
strategy=bfs_strategy,
|
||||
crawler_run_config=CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy()
|
||||
),
|
||||
stream=True,
|
||||
config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy,
|
||||
stream=True)
|
||||
)
|
||||
async for result in result_generator:
|
||||
stats["processed"] += 1
|
||||
|
||||
Reference in New Issue
Block a user