refactor(core): reorganize project structure and remove legacy code

Major reorganization of the project structure:
- Moved legacy synchronous crawler code to legacy folder
- Removed deprecated CLI and docs manager
- Consolidated version manager into utils.py
- Added CrawlerHub to __init__.py exports
- Fixed type hints in async_webcrawler.py
- Fixed minor bugs in chunking and crawler strategies

BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
Author: UncleCode
Date:   2025-01-30 19:35:06 +08:00
Parent: 31938fb922
Commit: f81712eb91

23 changed files with 425 additions and 4 deletions
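
Since the synchronous WebCrawler is gone, existing call sites need to move to the async API. A minimal, hedged migration sketch (the exact config options depend on what the old crawler was doing):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def main():
    # replaces the old synchronous WebCrawler usage
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print(result.markdown)

asyncio.run(main())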

@@ -14,6 +14,8 @@ from .extraction_strategy import (
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
@@ -26,10 +28,12 @@ from .async_dispatcher import (
    DisplayMode,
    BaseDispatcher
)
+from .hub import CrawlerHub
__all__ = [
    "AsyncWebCrawler",
    "CrawlResult",
+    "CrawlerHub",
    "CacheMode",
    "ContentScrapingStrategy",
    "WebScrapingStrategy",

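With CrawlerHub exported from the package root as shown above, hub lookups no longer need the internal module path. A small sketch (the crawler name is whatever is registered under crawl4ai/crawlers/):

from crawl4ai import CrawlerHub

crawler_cls = CrawlerHub.get("google_search")  # returns None if no such crawler is registered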
@@ -1265,6 +1265,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        """
        config.url = url
        response_headers = {}
+       execution_result = None
        status_code = None
        redirected_url = url
@@ -1522,6 +1523,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                execution_result = await self.robust_execute_user_script(
                    page, config.js_code
                )
                if not execution_result["success"]:
                    self.logger.warning(
                        message="User script execution had issues: {error}",

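For context, the execution_result guard above is what surfaces user-script failures; downstream, callers read the outcome from result.js_execution_result. A hedged usage sketch (assuming a crawler and url are already set up):

config = CrawlerRunConfig(js_code="document.querySelector('button.load-more')?.click();")
result = await crawler.arun(url=url, config=config)
js_outcome = result.js_execution_result or {}
if not js_outcome.get("success", True):
    print("User script had issues:", js_outcome.get("error"))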
@@ -9,7 +9,7 @@ import json  # Added for serialization/deserialization
from .utils import ensure_content_dirs, generate_content_hash
from .models import CrawlResult, MarkdownGenerationResult
import aiofiles
-from .version_manager import VersionManager
+from .utils import VersionManager
from .async_logger import AsyncLogger
from .utils import get_error_context, create_box_message

@@ -49,6 +49,12 @@ from collections.abc import AsyncGenerator
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
+DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
+DeepCrawlManyReturn = Union[
+    List[List[CrawlResultT]],
+    AsyncGenerator[CrawlResultT, None],
+]
from .__version__ import __version__ as crawl4ai_version
@@ -282,7 +288,7 @@ class AsyncWebCrawler:
        user_agent: str = None,
        verbose=True,
        **kwargs,
-    ) -> CrawlResult:
+    ) -> Union[CrawlResult, DeepCrawlSingleReturn]:
        """
        Runs the crawler for a single source: URL (web, local file, or raw HTML).
@@ -709,7 +715,7 @@ class AsyncWebCrawler:
        user_agent: str = None,
        verbose=True,
        **kwargs
-    ) -> RunManyReturn:
+    ) -> Union[RunManyReturn, DeepCrawlManyReturn]:
        """
        Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.

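With the widened return annotations, arun() and arun_many() may hand back either concrete results or an async generator (e.g. when a deep-crawl or streaming dispatcher is in play). A hedged consumption sketch, where urls, run_config, and handle() are placeholders:

results = await crawler.arun_many(urls=urls, config=run_config)
if hasattr(results, "__aiter__"):
    # streaming / deep-crawl path: AsyncGenerator[CrawlResult, None]
    async for res in results:
        handle(res)
else:
    # plain List[CrawlResult]
    for res in results:
        handle(res)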
@@ -4,7 +4,6 @@ from collections import Counter
import string
-from .model_loader import load_nltk_punkt
# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
    """
@@ -72,6 +71,7 @@ class NlpSentenceChunking(ChunkingStrategy):
        """
        Initialize the NlpSentenceChunking object.
        """
+        from crawl4ai.legacy.model_loader import load_nltk_punkt
        load_nltk_punkt()
    def chunk(self, text: str) -> list:

@@ -0,0 +1,20 @@
import json  # needed for the JSON error payload below
from crawl4ai.hub import BaseCrawler

__meta__ = {
    "version": "1.2.0",
    "tested_on": ["amazon.com"],
    "rate_limit": "50 RPM",
    "schema": {"product": ["name", "price"]}
}

class AmazonProductCrawler(BaseCrawler):
    async def run(self, url: str, **kwargs) -> str:
        try:
            self.logger.info(f"Crawling {url}")
            return '{"product": {"name": "Test Amazon Product"}}'
        except Exception as e:
            self.logger.error(f"Crawl failed: {str(e)}")
            return json.dumps({
                "error": str(e),
                "metadata": self.meta  # Include meta in error response
            })

@@ -0,0 +1,125 @@
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.hub import BaseCrawler
from crawl4ai.utils import optimize_html, get_home_folder
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from pathlib import Path
import json
import os
import asyncio
from typing import Dict, Any

class GoogleSearchCrawler(BaseCrawler):
    __meta__ = {
        "version": "1.0.0",
        "tested_on": ["google.com/search*"],
        "rate_limit": "10 RPM",
        "description": "Crawls Google Search results (text + images)",
    }

    def __init__(self):
        super().__init__()
        self.js_script = (Path(__file__).parent / "script.js").read_text()
    async def run(self, url="", query: str = "", search_type: str = "text", schema_cache_path=None, **kwargs) -> str:
        """Crawl Google Search results for a query"""
        url = (
            f"https://www.google.com/search?q={query}&gl=sg&hl=en"
            if search_type == "text"
            else f"https://www.google.com/search?q={query}&gl=sg&hl=en&tbs=qdr:d&udm=2"
        )
        browser_config = BrowserConfig(headless=True, verbose=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
                delay_before_return_html=kwargs.get("delay", 2 if search_type == "image" else 1),
                js_code=self.js_script if search_type == "image" else None,
            )
            result = await crawler.arun(url=url, config=config)
            if not result.success:
                return json.dumps({"error": result.error})

            if search_type == "image":
                if result.js_execution_result.get("success", False) is False:
                    return json.dumps({"error": result.js_execution_result.get("error", "Unknown error")})
                if "results" in result.js_execution_result:
                    image_result = result.js_execution_result['results'][0]
                    if image_result.get("success", False) is False:
                        return json.dumps({"error": image_result.get("error", "Unknown error")})
                    return json.dumps(image_result["result"], indent=4)

            # For text search, extract structured data
            schemas = await self._build_schemas(result.cleaned_html, schema_cache_path)
            extracted = {
                key: JsonCssExtractionStrategy(schema=schemas[key]).run(
                    url=url, sections=[result.html]
                )
                for key in schemas
            }
            return json.dumps(extracted, indent=4)
    async def _build_schemas(self, html: str, schema_cache_path: str = None) -> Dict[str, Dict]:
        """Build extraction schemas (organic, top stories, etc.)"""
        home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
        os.makedirs(f"{home_dir}/schema", exist_ok=True)
        cleaned_html = optimize_html(html, threshold=100)

        organic_schema = None
        if os.path.exists(f"{home_dir}/schema/organic_schema.json"):
            with open(f"{home_dir}/schema/organic_schema.json", "r") as f:
                organic_schema = json.load(f)
        else:
            organic_schema = JsonCssExtractionStrategy.generate_schema(
                html=cleaned_html,
                target_json_example="""{
                    "title": "...",
                    "link": "...",
                    "snippet": "...",
                    "date": "1 hour ago",
                }""",
                query="""The given HTML is crawled from a Google search results page. Please find the schema for an organic search result item in the given HTML; I am interested in title, link, snippet text, and date."""
            )
            with open(f"{home_dir}/schema/organic_schema.json", "w") as f:
                f.write(json.dumps(organic_schema))

        top_stories_schema = None
        if os.path.exists(f"{home_dir}/schema/top_stories_schema.json"):
            with open(f"{home_dir}/schema/top_stories_schema.json", "r") as f:
                top_stories_schema = json.load(f)
        else:
            top_stories_schema = JsonCssExtractionStrategy.generate_schema(
                html=cleaned_html,
                target_json_example="""{
                    "title": "...",
                    "link": "...",
                    "source": "Insider Monkey",
                    "date": "1 hour ago",
                    "imageUrl": "..."
                }""",
                query="""The given HTML is crawled from a Google search results page. Please find the schema for a Top Stories item in the given HTML; I am interested in title, link, source, date, and imageUrl."""
            )
            with open(f"{home_dir}/schema/top_stories_schema.json", "w") as f:
                f.write(json.dumps(top_stories_schema))

        suggested_query_schema = None
        if os.path.exists(f"{home_dir}/schema/suggested_query_schema.json"):
            with open(f"{home_dir}/schema/suggested_query_schema.json", "r") as f:
                suggested_query_schema = json.load(f)
        else:
            suggested_query_schema = JsonCssExtractionStrategy.generate_schema(
                html=cleaned_html,
                target_json_example="""{
                    "query": "A for Apple",
                }""",
                query="""The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "People also search for" within the given HTML. I am interested in the queries only."""
            )
            with open(f"{home_dir}/schema/suggested_query_schema.json", "w") as f:
                f.write(json.dumps(suggested_query_schema))

        return {
            "organic_schema": organic_schema,
            "top_stories_schema": top_stories_schema,
            "suggested_query_schema": suggested_query_schema,
        }

@@ -0,0 +1,115 @@
(() => {
    // Function to extract image data from Google Images page
    function extractImageData() {
        const keys = Object.keys(window.W_jd);
        let allImageData = [];
        let currentPosition = 0;

        // Get the symbol we'll use (from first valid entry)
        let targetSymbol;
        for (let key of keys) {
            try {
                const symbols = Object.getOwnPropertySymbols(window.W_jd[key]);
                if (symbols.length > 0) {
                    targetSymbol = symbols[0];
                    break;
                }
            } catch (e) {
                continue;
            }
        }
        if (!targetSymbol) return [];

        // Iterate through ALL keys
        for (let key of keys) {
            try {
                const o1 = window.W_jd[key][targetSymbol];
                if (!o1) continue;
                const data = Object.values(o1)[0];
                // const data = window.W_jd[key][targetSymbol]?.Ws;
                // Check if this is a valid image data entry
                if (data && Array.isArray(data[1])) {
                    const processedData = processImageEntry(data, currentPosition);
                    if (processedData) {
                        allImageData.push(processedData);
                        currentPosition++;
                    }
                }
            } catch (e) {
                continue;
            }
        }
        return allImageData;
    }

    function processImageEntry(entry, position) {
        const imageData = entry[1];
        if (!Array.isArray(imageData)) return null;

        // Extract the image ID
        const imageId = imageData[1];
        if (!imageId) return null;

        // Find the corresponding DOM element
        const domElement = document.querySelector(`[data-docid="${imageId}"]`);
        if (!domElement) return null;

        // Extract data from the array structure
        const [
            _,
            id,
            thumbnailInfo,
            imageInfo,
            __,
            ___,
            rgb,
            ____,
            _____,
            metadata
        ] = imageData;

        // Ensure we have the required data
        if (!thumbnailInfo || !imageInfo) return null;

        // Extract metadata from DOM
        const title = domElement?.querySelector('.toI8Rb')?.textContent?.trim();
        const source = domElement?.querySelector('.guK3rf')?.textContent?.trim();
        const link = domElement?.querySelector('a.EZAeBe')?.href;
        if (!link) return null;

        // Build Google Image URL
        const googleUrl = buildGoogleImageUrl(imageInfo[0], link, imageId, imageInfo[1], imageInfo[2]);

        return {
            title,
            imageUrl: imageInfo[0],
            imageWidth: imageInfo[2],
            imageHeight: imageInfo[1],
            thumbnailUrl: thumbnailInfo[0],
            thumbnailWidth: thumbnailInfo[2],
            thumbnailHeight: thumbnailInfo[1],
            source,
            domain: metadata['2000']?.[1] || new URL(link).hostname,
            link,
            googleUrl,
            position: position + 1
        };
    }

    function buildGoogleImageUrl(imgUrl, refUrl, tbnid, height, width) {
        const params = new URLSearchParams({
            imgurl: imgUrl,
            tbnid: tbnid,
            imgrefurl: refUrl,
            docid: tbnid,
            w: width.toString(),
            h: height.toString(),
        });
        return `https://www.google.com/imgres?${params.toString()}`;
    }

    return extractImageData();
})();

crawl4ai/hub.py (new file, 73 lines)

@@ -0,0 +1,73 @@
import importlib
import pkgutil
from pathlib import Path
import logging
from typing import Dict, Type
import inspect

logger = logging.getLogger(__name__)

# crawl4ai/base.py
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any
import json
import logging

class BaseCrawler(ABC):
    def __init__(self):
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    async def run(self, url: str = "", **kwargs) -> str:
        """
        Implement this method to return JSON string.
        Must accept URL + arbitrary kwargs for flexibility.
        """
        pass

    def __init_subclass__(cls, **kwargs):
        """Enforce interface validation on subclassing"""
        super().__init_subclass__(**kwargs)
        # Verify run method signature
        run_method = cls.run
        if not run_method.__code__.co_argcount >= 2:  # self + url
            raise TypeError(f"{cls.__name__} must implement 'run(self, url: str, **kwargs)'")
        # Verify async nature
        if not inspect.iscoroutinefunction(run_method):
            raise TypeError(f"{cls.__name__}.run must be async")

class CrawlerHub:
    _crawlers: Dict[str, Type[BaseCrawler]] = {}

    @classmethod
    def _discover_crawlers(cls):
        """Dynamically load crawlers from /crawlers in 3 lines"""
        base_path = Path(__file__).parent / "crawlers"
        for crawler_dir in base_path.iterdir():
            if crawler_dir.is_dir():
                try:
                    module = importlib.import_module(
                        f"crawl4ai.crawlers.{crawler_dir.name}.crawler"
                    )
                    for attr in dir(module):
                        cls._maybe_register_crawler(
                            getattr(module, attr), crawler_dir.name
                        )
                except Exception as e:
                    logger.warning(f"Failed {crawler_dir.name}: {str(e)}")

    @classmethod
    def _maybe_register_crawler(cls, obj, name: str):
        """Brilliant one-liner registration"""
        if isinstance(obj, type) and issubclass(obj, BaseCrawler) and obj != BaseCrawler:
            module = importlib.import_module(obj.__module__)
            obj.meta = getattr(module, "__meta__", {})
            cls._crawlers[name] = obj

    @classmethod
    def get(cls, name: str) -> Type[BaseCrawler] | None:
        if not cls._crawlers:
            cls._discover_crawlers()
        return cls._crawlers.get(name)
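
For a crawler to be discovered, it has to live in crawl4ai/crawlers/<name>/crawler.py and define a concrete BaseCrawler subclass, optionally with a module-level __meta__ dict. A hypothetical example module (the "my_site" name and its contents are made up for illustration):

# crawl4ai/crawlers/my_site/crawler.py  (hypothetical)
import json
from crawl4ai.hub import BaseCrawler

__meta__ = {"version": "0.1.0", "rate_limit": "60 RPM"}

class MySiteCrawler(BaseCrawler):
    async def run(self, url: str = "", **kwargs) -> str:
        # BaseCrawler requires an async run() that returns a JSON string
        return json.dumps({"url": url, "ok": True})

CrawlerHub.get("my_site") would then return MySiteCrawler with its __meta__ attached as MySiteCrawler.meta.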

@@ -28,6 +28,35 @@ import hashlib
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import aiohttp
+from pathlib import Path
+from packaging import version
+from . import __version__
+
+class VersionManager:
+    def __init__(self):
+        self.home_dir = Path.home() / ".crawl4ai"
+        self.version_file = self.home_dir / "version.txt"
+
+    def get_installed_version(self):
+        """Get the version recorded in home directory"""
+        if not self.version_file.exists():
+            return None
+        try:
+            return version.parse(self.version_file.read_text().strip())
+        except:
+            return None
+
+    def update_version(self):
+        """Update the version file to current library version"""
+        self.version_file.write_text(__version__.__version__)
+
+    def needs_update(self):
+        """Check if database needs update based on version"""
+        installed = self.get_installed_version()
+        current = version.parse(__version__.__version__)
+        return installed is None or installed < current
+
class RobotsParser:
    # Default 7 days cache TTL
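
A short sketch of how the relocated VersionManager is used now that it lives in utils (imported as from crawl4ai.utils import VersionManager, matching the async_database change above):

from crawl4ai.utils import VersionManager

vm = VersionManager()
if vm.needs_update():
    # e.g. refresh on-disk caches/DB for the new library version, then record it
    vm.update_version()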

@@ -0,0 +1,17 @@
# example_usage.py
import asyncio
from crawl4ai.crawlers import get_crawler

async def main():
    # Get the registered crawler
    example_crawler = get_crawler("example_site.content")

    # Crawl example.com
    result = await example_crawler(url="https://example.com")
    print(result)

if __name__ == "__main__":
    asyncio.run(main())

tests/hub/test_simple.py (new file, 30 lines)

@@ -0,0 +1,30 @@
# test.py
from crawl4ai import CrawlerHub
import json

async def amazon_example():
    if (crawler_cls := CrawlerHub.get("amazon_product")):
        crawler = crawler_cls()
        print(f"Crawler version: {crawler_cls.meta['version']}")
        print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}")
        print(await crawler.run("https://amazon.com/test"))
    else:
        print("Crawler not found!")

async def google_example():
    # Get crawler dynamically
    crawler_cls = CrawlerHub.get("google_search")
    crawler = crawler_cls()

    # Text search
    text_results = await crawler.run(query="apple inc", search_type="text", schema_cache_path="/Users/unclecode/.crawl4ai")
    print(json.loads(text_results))

    # Image search
    image_results = await crawler.run(query="apple inc", search_type="image")
    print(image_results)

if __name__ == "__main__":
    import asyncio
    # asyncio.run(amazon_example())
    asyncio.run(google_example())