From 91073c1244b28d31d2566b15d4c2b0fcae856b5f Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 7 Feb 2025 19:01:59 +0800 Subject: [PATCH] refactor(crawling): improve type hints and code cleanup - Added proper return type hints for DeepCrawlStrategy.arun method - Added __call__ method to DeepCrawlStrategy for easier usage - Removed redundant comments and imports - Cleaned up type hints in DFS strategy - Removed the empty docker_client.py and the .continuerules file - Added .private/ to .gitignore BREAKING CHANGE: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] --- .continuerules | 4 ---- .gitignore | 3 ++- crawl4ai/deep_crawling/base_strategy.py | 5 ++++- crawl4ai/deep_crawling/bfs_strategy.py | 3 --- crawl4ai/deep_crawling/dfs_strategy.py | 17 ++++++----------- crawl4ai/deep_crawling/scorers.py | 2 -- docker_client.py | 0 7 files changed, 12 insertions(+), 22 deletions(-) delete mode 100644 .continuerules delete mode 100644 docker_client.py diff --git a/.continuerules b/.continuerules deleted file mode 100644 index d16f5b48..00000000 --- a/.continuerules +++ /dev/null @@ -1,4 +0,0 @@ -- This project Crawl4ai, I am very sensitive to any change, make sure to follow my instruction closely. -- Never apply changes I never asked, you are not 100$ free agent to do whatever you want. -- All changes must be relevant to the requests I asked for. -- Do not jumpt tp make changes first share your plane and explain to me what you wanna do. 
\ No newline at end of file diff --git a/.gitignore b/.gitignore index e10e3e88..68e715ef 100644 --- a/.gitignore +++ b/.gitignore @@ -250,4 +250,5 @@ continue_config.json .prompts/ -.llm.env \ No newline at end of file +.llm.env +.private/ \ No newline at end of file diff --git a/crawl4ai/deep_crawling/base_strategy.py b/crawl4ai/deep_crawling/base_strategy.py index b7564bcd..222338a3 100644 --- a/crawl4ai/deep_crawling/base_strategy.py +++ b/crawl4ai/deep_crawling/base_strategy.py @@ -94,7 +94,7 @@ class DeepCrawlStrategy(ABC): crawler_run_config (Optional[CrawlerRunConfig]): Crawler configuration. Returns: - AsyncGenerator[CrawlResult, None]: An async generator yielding crawl results. + Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] """ if config is None: raise ValueError("CrawlerRunConfig must be provided") @@ -104,6 +104,9 @@ class DeepCrawlStrategy(ABC): else: return await self._arun_batch(start_url, crawler, config) + def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig): + return self.arun(start_url, crawler, config) + @abstractmethod async def shutdown(self) -> None: """ diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index fd2a7aae..c8b700c4 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -131,7 +131,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): depth = depths.get(url, 0) result.metadata = result.metadata or {} result.metadata["depth"] = depth - # Retrieve parent_url from current_level. parent_url = next((parent for (u, parent) in current_level if u == url), None) result.metadata["parent_url"] = parent_url results.append(result) @@ -174,8 +173,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): current_level = next_level - - async def shutdown(self) -> None: """ Clean up resources and signal cancellation of the crawl. 
diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py index 20a6e466..423315f0 100644 --- a/crawl4ai/deep_crawling/dfs_strategy.py +++ b/crawl4ai/deep_crawling/dfs_strategy.py @@ -2,13 +2,8 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple from ..models import CrawlResult -from .bfs_strategy import BFSDeepCrawlStrategy # Inherit common logic: can_process_url, link_discovery, etc. - -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from ..async_configs import CrawlerRunConfig - from ..async_webcrawler import AsyncWebCrawler - +from .bfs_strategy import BFSDeepCrawlStrategy # noqa +from ..types import AsyncWebCrawler, CrawlerRunConfig class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): """ @@ -20,8 +15,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): async def _arun_batch( self, start_url: str, - crawler: "AsyncWebCrawler", - config: "CrawlerRunConfig", + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, ) -> List[CrawlResult]: """ Batch (non-streaming) DFS mode. @@ -61,8 +56,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): async def _arun_stream( self, start_url: str, - crawler: "AsyncWebCrawler", - config: "CrawlerRunConfig", + crawler: AsyncWebCrawler, + config: CrawlerRunConfig, ) -> AsyncGenerator[CrawlResult, None]: """ Streaming DFS mode. diff --git a/crawl4ai/deep_crawling/scorers.py b/crawl4ai/deep_crawling/scorers.py index 1c50b51a..53fdd6aa 100644 --- a/crawl4ai/deep_crawling/scorers.py +++ b/crawl4ai/deep_crawling/scorers.py @@ -3,11 +3,9 @@ from typing import List, Dict, Optional from dataclasses import dataclass from urllib.parse import urlparse, unquote import re -import math import logging from functools import lru_cache from array import array -from functools import lru_cache import ctypes import platform PLATFORM = platform.system() diff --git a/docker_client.py b/docker_client.py deleted file mode 100644 index e69de29b..00000000