refactor(crawling): improve type hints and code cleanup
- Added proper return type hints for DeepCrawlStrategy.arun method - Added __call__ method to DeepCrawlStrategy for easier usage - Removed redundant comments and imports - Cleaned up type hints in DFS strategy - Removed empty docker_client.py and .continuerules - Added .private/ to gitignore BREAKING CHANGE: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
This commit is contained in:
@@ -1,4 +0,0 @@
|
||||
- This project Crawl4ai, I am very sensitive to any change, make sure to follow my instruction closely.
|
||||
- Never apply changes I never asked, you are not 100$ free agent to do whatever you want.
|
||||
- All changes must be relevant to the requests I asked for.
|
||||
- Do not jumpt tp make changes first share your plane and explain to me what you wanna do.
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -251,3 +251,4 @@ continue_config.json
|
||||
.prompts/
|
||||
|
||||
.llm.env
|
||||
.private/
|
||||
@@ -94,7 +94,7 @@ class DeepCrawlStrategy(ABC):
|
||||
crawler_run_config (Optional[CrawlerRunConfig]): Crawler configuration.
|
||||
|
||||
Returns:
|
||||
AsyncGenerator[CrawlResult, None]: An async generator yielding crawl results.
|
||||
Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
"""
|
||||
if config is None:
|
||||
raise ValueError("CrawlerRunConfig must be provided")
|
||||
@@ -104,6 +104,9 @@ class DeepCrawlStrategy(ABC):
|
||||
else:
|
||||
return await self._arun_batch(start_url, crawler, config)
|
||||
|
||||
def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig):
|
||||
return self.arun(start_url, crawler, config)
|
||||
|
||||
@abstractmethod
|
||||
async def shutdown(self) -> None:
|
||||
"""
|
||||
|
||||
@@ -131,7 +131,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
depth = depths.get(url, 0)
|
||||
result.metadata = result.metadata or {}
|
||||
result.metadata["depth"] = depth
|
||||
# Retrieve parent_url from current_level.
|
||||
parent_url = next((parent for (u, parent) in current_level if u == url), None)
|
||||
result.metadata["parent_url"] = parent_url
|
||||
results.append(result)
|
||||
@@ -174,8 +173,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
|
||||
current_level = next_level
|
||||
|
||||
|
||||
|
||||
async def shutdown(self) -> None:
|
||||
"""
|
||||
Clean up resources and signal cancellation of the crawl.
|
||||
|
||||
@@ -2,13 +2,8 @@
|
||||
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
|
||||
|
||||
from ..models import CrawlResult
|
||||
from .bfs_strategy import BFSDeepCrawlStrategy # Inherit common logic: can_process_url, link_discovery, etc.
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
if TYPE_CHECKING:
|
||||
from ..async_configs import CrawlerRunConfig
|
||||
from ..async_webcrawler import AsyncWebCrawler
|
||||
|
||||
from .bfs_strategy import BFSDeepCrawlStrategy # noqa
|
||||
from ..types import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
"""
|
||||
@@ -20,8 +15,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
async def _arun_batch(
|
||||
self,
|
||||
start_url: str,
|
||||
crawler: "AsyncWebCrawler",
|
||||
config: "CrawlerRunConfig",
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
) -> List[CrawlResult]:
|
||||
"""
|
||||
Batch (non-streaming) DFS mode.
|
||||
@@ -61,8 +56,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
async def _arun_stream(
|
||||
self,
|
||||
start_url: str,
|
||||
crawler: "AsyncWebCrawler",
|
||||
config: "CrawlerRunConfig",
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
) -> AsyncGenerator[CrawlResult, None]:
|
||||
"""
|
||||
Streaming DFS mode.
|
||||
|
||||
@@ -3,11 +3,9 @@ from typing import List, Dict, Optional
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import urlparse, unquote
|
||||
import re
|
||||
import math
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from array import array
|
||||
from functools import lru_cache
|
||||
import ctypes
|
||||
import platform
|
||||
PLATFORM = platform.system()
|
||||
|
||||
Reference in New Issue
Block a user