diff --git a/.continuerules b/.continuerules deleted file mode 100644 index d16f5b48..00000000 --- a/.continuerules +++ /dev/null @@ -1,4 +0,0 @@ -- This project Crawl4ai, I am very sensitive to any change, make sure to follow my instruction closely. -- Never apply changes I never asked, you are not 100$ free agent to do whatever you want. -- All changes must be relevant to the requests I asked for. -- Do not jumpt tp make changes first share your plane and explain to me what you wanna do. \ No newline at end of file diff --git a/.gitignore b/.gitignore index e10e3e88..68e715ef 100644 --- a/.gitignore +++ b/.gitignore @@ -250,4 +250,5 @@ continue_config.json .prompts/ -.llm.env \ No newline at end of file +.llm.env +.private/ \ No newline at end of file diff --git a/crawl4ai/deep_crawling/base_strategy.py b/crawl4ai/deep_crawling/base_strategy.py index b7564bcd..222338a3 100644 --- a/crawl4ai/deep_crawling/base_strategy.py +++ b/crawl4ai/deep_crawling/base_strategy.py @@ -94,7 +94,7 @@ class DeepCrawlStrategy(ABC): crawler_run_config (Optional[CrawlerRunConfig]): Crawler configuration. Returns: - AsyncGenerator[CrawlResult, None]: An async generator yielding crawl results. + Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] """ if config is None: raise ValueError("CrawlerRunConfig must be provided") @@ -104,6 +104,9 @@ class DeepCrawlStrategy(ABC): else: return await self._arun_batch(start_url, crawler, config) + def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig): + return self.arun(start_url, crawler, config) + @abstractmethod async def shutdown(self) -> None: """ diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index fd2a7aae..c8b700c4 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -131,7 +131,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): depth = depths.get(url, 0) result.metadata = result.metadata or {} result.metadata["depth"] = depth - # Retrieve parent_url from current_level. parent_url = next((parent for (u, parent) in current_level if u == url), None) result.metadata["parent_url"] = parent_url results.append(result) @@ -174,8 +173,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): current_level = next_level - - async def shutdown(self) -> None: """ Clean up resources and signal cancellation of the crawl. diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py index 20a6e466..423315f0 100644 --- a/crawl4ai/deep_crawling/dfs_strategy.py +++ b/crawl4ai/deep_crawling/dfs_strategy.py @@ -2,13 +2,8 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple from ..models import CrawlResult -from .bfs_strategy import BFSDeepCrawlStrategy # Inherit common logic: can_process_url, link_discovery, etc. - -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from ..async_configs import CrawlerRunConfig - from ..async_webcrawler import AsyncWebCrawler - +from .bfs_strategy import BFSDeepCrawlStrategy # noqa +from ..types import AsyncWebCrawler, CrawlerRunConfig class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): """ @@ -20,8 +15,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): async def _arun_batch( self, start_url: str, - crawler: "AsyncWebCrawler", - config: "CrawlerRunConfig", + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, ) -> List[CrawlResult]: """ Batch (non-streaming) DFS mode. @@ -61,8 +56,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): async def _arun_stream( self, start_url: str, - crawler: "AsyncWebCrawler", - config: "CrawlerRunConfig", + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, ) -> AsyncGenerator[CrawlResult, None]: """ Streaming DFS mode. diff --git a/crawl4ai/deep_crawling/scorers.py b/crawl4ai/deep_crawling/scorers.py index 1c50b51a..53fdd6aa 100644 --- a/crawl4ai/deep_crawling/scorers.py +++ b/crawl4ai/deep_crawling/scorers.py @@ -3,11 +3,9 @@ from typing import List, Dict, Optional from dataclasses import dataclass from urllib.parse import urlparse, unquote import re -import math import logging from functools import lru_cache from array import array -from functools import lru_cache import ctypes import platform PLATFORM = platform.system() diff --git a/docker_client.py b/docker_client.py deleted file mode 100644 index e69de29b..00000000