refactor(core): reorganize project structure and remove legacy code

Major reorganization of the project structure:
- Moved legacy synchronous crawler code to legacy folder
- Removed deprecated CLI and docs manager
- Consolidated version manager into utils.py
- Added CrawlerHub to __init__.py exports
- Fixed type hints in async_webcrawler.py
- Fixed minor bugs in chunking and crawler strategies

BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
This commit is contained in:
UncleCode
2025-01-30 19:35:06 +08:00
parent 31938fb922
commit f81712eb91
23 changed files with 425 additions and 4 deletions

View File

@@ -0,0 +1,17 @@
# example_usageexample_usageexample_usage# example_usage.py
import asyncio
from crawl4ai.crawlers import get_crawler
async def main():
# Get the registered crawler
example_crawler = get_crawler("example_site.content")
# Crawl example.com
result = await example_crawler(url="https://example.com")
print(result)
if __name__ == "__main__":
asyncio.run(main())

30
tests/hub/test_simple.py Normal file
View File

@@ -0,0 +1,30 @@
# test.py
from crawl4ai import CrawlerHub
import json
async def amazon_example():
if (crawler_cls := CrawlerHub.get("amazon_product")) :
crawler = crawler_cls()
print(f"Crawler version: {crawler_cls.meta['version']}")
print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}")
print(await crawler.run("https://amazon.com/test"))
else:
print("Crawler not found!")
async def google_example():
# Get crawler dynamically
crawler_cls = CrawlerHub.get("google_search")
crawler = crawler_cls()
# Text search
text_results = await crawler.run(query="apple inc", search_type="text", schema_cache_path="/Users/unclecode/.crawl4ai")
print(json.loads(text_results))
# Image search
image_results = await crawler.run(query="apple inc", search_type="image")
print(image_results)
if __name__ == "__main__":
import asyncio
# asyncio.run(amazon_example())
asyncio.run(google_example())