refactor(crawling): improve type hints and code cleanup

- Added proper return type hints for DeepCrawlStrategy.arun method
- Added __call__ method to DeepCrawlStrategy for easier usage
- Removed redundant comments and imports
- Cleaned up type hints in DFS strategy
- Removed empty docker_client.py and .continuerules
- Added .private/ to gitignore

BREAKING CHANGE: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
This commit is contained in:
UncleCode
2025-02-07 19:01:59 +08:00
parent 926beee832
commit 91073c1244
7 changed files with 12 additions and 22 deletions

View File

@@ -1,4 +0,0 @@
- This project is Crawl4ai; I am very sensitive to any change, so make sure to follow my instructions closely.
- Never apply changes I never asked for; you are not a 100% free agent able to do whatever you want.
- All changes must be relevant to the requests I asked for.
- Do not jump to making changes; first share your plan and explain to me what you want to do.

3
.gitignore vendored
View File

@@ -250,4 +250,5 @@ continue_config.json
.prompts/
.llm.env
.llm.env
.private/

View File

@@ -94,7 +94,7 @@ class DeepCrawlStrategy(ABC):
crawler_run_config (Optional[CrawlerRunConfig]): Crawler configuration.
Returns:
AsyncGenerator[CrawlResult, None]: An async generator yielding crawl results.
Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
"""
if config is None:
raise ValueError("CrawlerRunConfig must be provided")
@@ -104,6 +104,9 @@ class DeepCrawlStrategy(ABC):
else:
return await self._arun_batch(start_url, crawler, config)
def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig):
return self.arun(start_url, crawler, config)
@abstractmethod
async def shutdown(self) -> None:
"""

View File

@@ -131,7 +131,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
depth = depths.get(url, 0)
result.metadata = result.metadata or {}
result.metadata["depth"] = depth
# Retrieve parent_url from current_level.
parent_url = next((parent for (u, parent) in current_level if u == url), None)
result.metadata["parent_url"] = parent_url
results.append(result)
@@ -174,8 +173,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
current_level = next_level
async def shutdown(self) -> None:
"""
Clean up resources and signal cancellation of the crawl.

View File

@@ -2,13 +2,8 @@
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from ..models import CrawlResult
from .bfs_strategy import BFSDeepCrawlStrategy # Inherit common logic: can_process_url, link_discovery, etc.
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ..async_configs import CrawlerRunConfig
from ..async_webcrawler import AsyncWebCrawler
from .bfs_strategy import BFSDeepCrawlStrategy # noqa
from ..types import AsyncWebCrawler, CrawlerRunConfig
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
"""
@@ -20,8 +15,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
async def _arun_batch(
self,
start_url: str,
crawler: "AsyncWebCrawler",
config: "CrawlerRunConfig",
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> List[CrawlResult]:
"""
Batch (non-streaming) DFS mode.
@@ -61,8 +56,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
async def _arun_stream(
self,
start_url: str,
crawler: "AsyncWebCrawler",
config: "CrawlerRunConfig",
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]:
"""
Streaming DFS mode.

View File

@@ -3,11 +3,9 @@ from typing import List, Dict, Optional
from dataclasses import dataclass
from urllib.parse import urlparse, unquote
import re
import math
import logging
from functools import lru_cache
from array import array
from functools import lru_cache
import ctypes
import platform
PLATFORM = platform.system()

View File