refactor(core): reorganize project structure and remove legacy code

Major reorganization of the project structure:
- Moved legacy synchronous crawler code to legacy folder
- Removed deprecated CLI and docs manager
- Consolidated version manager into utils.py
- Added CrawlerHub to __init__.py exports
- Fixed type hints in async_webcrawler.py
- Fixed minor bugs in chunking and crawler strategies

BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
This commit is contained in:
UncleCode
2025-01-30 19:35:06 +08:00
parent 31938fb922
commit f81712eb91
23 changed files with 425 additions and 4 deletions

View File

@@ -28,6 +28,35 @@ import hashlib
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import aiohttp
from pathlib import Path
from packaging import version
from . import __version__
class VersionManager:
def __init__(self):
self.home_dir = Path.home() / ".crawl4ai"
self.version_file = self.home_dir / "version.txt"
def get_installed_version(self):
"""Get the version recorded in home directory"""
if not self.version_file.exists():
return None
try:
return version.parse(self.version_file.read_text().strip())
except:
return None
def update_version(self):
"""Update the version file to current library version"""
self.version_file.write_text(__version__.__version__)
def needs_update(self):
"""Check if database needs update based on version"""
installed = self.get_installed_version()
current = version.parse(__version__.__version__)
return installed is None or installed < current
class RobotsParser:
# Default 7 days cache TTL