refactor(crawling): improve type hints and code cleanup

- Added proper return type hints for DeepCrawlStrategy.arun method
- Added __call__ method to DeepCrawlStrategy for easier usage
- Removed redundant comments and imports
- Cleaned up type hints in DFS strategy
- Removed empty docker_client.py and .continuerules
- Added .private/ to gitignore

BREAKING CHANGE: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
This commit is contained in:
UncleCode
2025-02-07 19:01:59 +08:00
parent 926beee832
commit 91073c1244
7 changed files with 12 additions and 22 deletions

View File

@@ -1,4 +0,0 @@
- This project is Crawl4ai; I am very sensitive to any change, so make sure to follow my instructions closely.
- Never apply changes I never asked for; you are not a 100% free agent to do whatever you want.
- All changes must be relevant to the requests I asked for.
- Do not jump to making changes first; share your plan and explain to me what you want to do.

3
.gitignore vendored
View File

@@ -250,4 +250,5 @@ continue_config.json
.prompts/ .prompts/
.llm.env .llm.env
.private/

View File

@@ -94,7 +94,7 @@ class DeepCrawlStrategy(ABC):
crawler_run_config (Optional[CrawlerRunConfig]): Crawler configuration. crawler_run_config (Optional[CrawlerRunConfig]): Crawler configuration.
Returns: Returns:
AsyncGenerator[CrawlResult, None]: An async generator yielding crawl results. Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
""" """
if config is None: if config is None:
raise ValueError("CrawlerRunConfig must be provided") raise ValueError("CrawlerRunConfig must be provided")
@@ -104,6 +104,9 @@ class DeepCrawlStrategy(ABC):
else: else:
return await self._arun_batch(start_url, crawler, config) return await self._arun_batch(start_url, crawler, config)
def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig):
return self.arun(start_url, crawler, config)
@abstractmethod @abstractmethod
async def shutdown(self) -> None: async def shutdown(self) -> None:
""" """

View File

@@ -131,7 +131,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
depth = depths.get(url, 0) depth = depths.get(url, 0)
result.metadata = result.metadata or {} result.metadata = result.metadata or {}
result.metadata["depth"] = depth result.metadata["depth"] = depth
# Retrieve parent_url from current_level.
parent_url = next((parent for (u, parent) in current_level if u == url), None) parent_url = next((parent for (u, parent) in current_level if u == url), None)
result.metadata["parent_url"] = parent_url result.metadata["parent_url"] = parent_url
results.append(result) results.append(result)
@@ -174,8 +173,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
current_level = next_level current_level = next_level
async def shutdown(self) -> None: async def shutdown(self) -> None:
""" """
Clean up resources and signal cancellation of the crawl. Clean up resources and signal cancellation of the crawl.

View File

@@ -2,13 +2,8 @@
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from ..models import CrawlResult from ..models import CrawlResult
from .bfs_strategy import BFSDeepCrawlStrategy # Inherit common logic: can_process_url, link_discovery, etc. from .bfs_strategy import BFSDeepCrawlStrategy # noqa
from ..types import AsyncWebCrawler, CrawlerRunConfig
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ..async_configs import CrawlerRunConfig
from ..async_webcrawler import AsyncWebCrawler
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
""" """
@@ -20,8 +15,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
async def _arun_batch( async def _arun_batch(
self, self,
start_url: str, start_url: str,
crawler: "AsyncWebCrawler", crawler: AsyncWebCrawler,
config: "CrawlerRunConfig", config: CrawlerRunConfig,
) -> List[CrawlResult]: ) -> List[CrawlResult]:
""" """
Batch (non-streaming) DFS mode. Batch (non-streaming) DFS mode.
@@ -61,8 +56,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
async def _arun_stream( async def _arun_stream(
self, self,
start_url: str, start_url: str,
crawler: "AsyncWebCrawler", crawler: AsyncWebCrawler,
config: "CrawlerRunConfig", config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]: ) -> AsyncGenerator[CrawlResult, None]:
""" """
Streaming DFS mode. Streaming DFS mode.

View File

@@ -3,11 +3,9 @@ from typing import List, Dict, Optional
from dataclasses import dataclass from dataclasses import dataclass
from urllib.parse import urlparse, unquote from urllib.parse import urlparse, unquote
import re import re
import math
import logging import logging
from functools import lru_cache from functools import lru_cache
from array import array from array import array
from functools import lru_cache
import ctypes import ctypes
import platform import platform
PLATFORM = platform.system() PLATFORM = platform.system()

View File