refactor(core): reorganize project structure and remove legacy code

Major reorganization of the project structure:
- Moved legacy synchronous crawler code to legacy folder
- Removed deprecated CLI and docs manager
- Consolidated version manager into utils.py
- Added CrawlerHub to __init__.py exports
- Fixed type hints in async_webcrawler.py
- Fixed minor bugs in chunking and crawler strategies

BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
This commit is contained in:
UncleCode
2025-01-30 19:35:06 +08:00
parent 31938fb922
commit f81712eb91
23 changed files with 425 additions and 4 deletions

View File

@@ -49,6 +49,12 @@ from collections.abc import AsyncGenerator
# Generic result type: any subtype of CrawlResult may flow through these aliases.
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
# Return shape for multi-URL runs: either a fully-materialized list of results
# or an async generator that streams results one at a time.
# NOTE(review): which form is returned is presumably selected by run config —
# confirm at the call sites / dispatcher.
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
# Deep-crawl of a single source — same two shapes as RunManyReturn
# (batch list, or streamed results).
DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
# Deep-crawl of many sources: batch mode returns one result list per source
# (list of lists); streaming mode collapses to a single flat async generator.
DeepCrawlManyReturn = Union[
    List[List[CrawlResultT]],
    AsyncGenerator[CrawlResultT, None],
]
# Package version re-exported under a distinct name for runtime use.
from .__version__ import __version__ as crawl4ai_version
@@ -282,7 +288,7 @@ class AsyncWebCrawler:
user_agent: str = None,
verbose=True,
**kwargs,
) -> CrawlResult:
) -> Union[CrawlResult, DeepCrawlSingleReturn]:
"""
Runs the crawler for a single source: URL (web, local file, or raw HTML).
@@ -709,7 +715,7 @@ class AsyncWebCrawler:
user_agent: str = None,
verbose=True,
**kwargs
) -> RunManyReturn:
) -> Union[RunManyReturn, DeepCrawlManyReturn]:
"""
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.