refactor(core): reorganize project structure and remove legacy code

Major reorganization of the project structure: moved legacy synchronous crawler code to the legacy folder; removed deprecated CLI and docs manager; consolidated version manager into utils.py; added CrawlerHub to __init__.py exports; fixed type hints in async_webcrawler.py; fixed minor bugs in chunking and crawler strategies. BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
2025-01-30 19:35:06 +08:00
parent 31938fb922
commit f81712eb91
23 changed files with 425 additions and 4 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -14,6 +14,8 @@ from .extraction_strategy import (
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy
 )
+
+
 from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import DefaultMarkdownGenerator
 from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
@@ -26,10 +28,12 @@ from .async_dispatcher import (
    DisplayMode,
    BaseDispatcher
 )
+from .hub import CrawlerHub

 __all__ = [
    "AsyncWebCrawler",
    "CrawlResult",
+    "CrawlerHub",
    "CacheMode",
    "ContentScrapingStrategy",
    "WebScrapingStrategy",
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1265,6 +1265,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        """
        config.url = url
        response_headers = {}
+        execution_result = None
        status_code = None
        redirected_url = url 

@@ -1522,6 +1523,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                execution_result = await self.robust_execute_user_script(
                    page, config.js_code
                )
+
                if not execution_result["success"]:
                    self.logger.warning(
                        message="User script execution had issues: {error}",
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -9,7 +9,7 @@ import json  # Added for serialization/deserialization
 from .utils import ensure_content_dirs, generate_content_hash
 from .models import CrawlResult, MarkdownGenerationResult
 import aiofiles
-from .version_manager import VersionManager
+from .utils import VersionManager
 from .async_logger import AsyncLogger
 from .utils import get_error_context, create_box_message

--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -49,6 +49,12 @@ from collections.abc import AsyncGenerator
 CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
 RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]

+# Used in the return annotation of the single-source run method. This is
+# currently the same union as RunManyReturn: a list of results or an async
+# generator of results.
+DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
+# Used in the return annotation of the multi-URL run method: either a list of
+# result lists or a single async generator of results.
+DeepCrawlManyReturn = Union[
+    List[List[CrawlResultT]],
+    AsyncGenerator[CrawlResultT, None],
+]
+
 from .__version__ import __version__ as crawl4ai_version


@@ -282,7 +288,7 @@ class AsyncWebCrawler:
        user_agent: str = None,
        verbose=True,
        **kwargs,
-    ) -> CrawlResult:
+    ) -> Union[CrawlResult, DeepCrawlSingleReturn]:
        """
        Runs the crawler for a single source: URL (web, local file, or raw HTML).

@@ -709,7 +715,7 @@ class AsyncWebCrawler:
        user_agent: str = None,
        verbose=True,
        **kwargs
-        ) -> RunManyReturn:
+        ) -> Union[RunManyReturn, DeepCrawlManyReturn]:
        """
        Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.

--- a/crawl4ai/chunking_strategy.py
+++ b/crawl4ai/chunking_strategy.py
@@ -4,7 +4,6 @@ from collections import Counter
 import string
 from .model_loader import load_nltk_punkt

-
 # Define the abstract base class for chunking strategies
 class ChunkingStrategy(ABC):
    """
@@ -72,6 +71,7 @@ class NlpSentenceChunking(ChunkingStrategy):
        """
        Initialize the NlpSentenceChunking object.
        """
+        from .model_loader import load_nltk_punkt
        load_nltk_punkt()

    def chunk(self, text: str) -> list:
--- a/crawl4ai/crawlers/__init__.py
+++ b/crawl4ai/crawlers/__init__.py
--- a/crawl4ai/crawlers/amazon_product/__init__.py
+++ b/crawl4ai/crawlers/amazon_product/__init__.py
--- a/crawl4ai/crawlers/amazon_product/crawler.py
+++ b/crawl4ai/crawlers/amazon_product/crawler.py
@@ -0,0 +1,20 @@
+from crawl4ai.hub import BaseCrawler
+
+__meta__ = {
+    "version": "1.2.0",
+    "tested_on": ["amazon.com"],
+    "rate_limit": "50 RPM",
+    "schema": {"product": ["name", "price"]}
+}
+
+class AmazonProductCrawler(BaseCrawler):
+    """Stub Amazon product crawler that returns a fixed JSON payload."""
+
+    async def run(self, url: str, **kwargs) -> str:
+        """Log the target URL and return a hard-coded product as a JSON string.
+
+        Args:
+            url: URL to crawl; currently it is only logged.
+            **kwargs: Accepted for interface compatibility; unused.
+
+        Returns:
+            A JSON string with the product, or a JSON string with ``error``
+            and ``metadata`` keys if an exception is raised.
+        """
+        # Imported locally because this module has no top-level ``import json``;
+        # without it the error path below raised NameError.
+        import json
+
+        try:
+            self.logger.info(f"Crawling {url}")
+            return '{"product": {"name": "Test Amazon Product"}}'
+        except Exception as e:
+            self.logger.error(f"Crawl failed: {str(e)}")
+            return json.dumps({
+                "error": str(e),
+                # ``meta`` is attached by CrawlerHub at registration time; fall
+                # back to the module-level ``__meta__`` when the class is
+                # instantiated directly.
+                "metadata": getattr(self, "meta", __meta__),
+            })
--- a/crawl4ai/crawlers/google_search/__init__.py
+++ b/crawl4ai/crawlers/google_search/__init__.py
--- a/crawl4ai/crawlers/google_search/crawler.py
+++ b/crawl4ai/crawlers/google_search/crawler.py
@@ -0,0 +1,125 @@
+from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.hub import BaseCrawler
+from crawl4ai.utils import optimize_html, get_home_folder
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from pathlib import Path
+import json
+import os
+import asyncio
+from typing import Dict, Any
+
+
+class GoogleSearchCrawler(BaseCrawler):
+    """Crawl Google Search result pages and return the findings as JSON.
+
+    Text searches are parsed with CSS extraction schemas that are generated on
+    first use and cached as JSON files. Image searches return the output of
+    ``script.js`` executed in the page.
+    """
+
+    __meta__ = {
+        "version": "1.0.0",
+        "tested_on": ["google.com/search*"],
+        "rate_limit": "10 RPM",
+        "description": "Crawls Google Search results (text + images)",
+    }
+
+    # Inputs for JsonCssExtractionStrategy.generate_schema, keyed by schema
+    # name: (target_json_example, query). The name is also the cache file stem
+    # (``<name>.json``) and the key used in the JSON returned by ``run``.
+    _SCHEMA_PROMPTS = {
+        "organic_schema": (
+            """{
+            "title": "...",
+            "link": "...",
+            "snippet": "...",
+            "date": "1 hour ago",
+        }""",
+            """The given html is the crawled html from Google search result. Please find the schema for organic search item in the given html, I am interested in title, link, snippet text. date.""",
+        ),
+        "top_stories_schema": (
+            """{
+            "title": "...",
+            "link": "...",
+            "source": "Insider Monkey",
+            "date": "1 hour ago",
+            "imageUrl": "..."
+        }""",
+            """The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl.""",
+        ),
+        "suggested_query_schema": (
+            """{
+            "query": "A for Apple",
+        }""",
+            """The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "People also search for" within the given HTML. I am interested in the queries only.""",
+        ),
+    }
+
+    def __init__(self):
+        super().__init__()
+        # JavaScript injected into the page for image searches.
+        self.js_script = (Path(__file__).parent /
+                          "script.js").read_text()
+
+    async def run(self, url="", query: str = "", search_type: str = "text", schema_cache_path = None, **kwargs) -> str:
+        """Crawl Google Search results for a query.
+
+        Args:
+            url: Ignored; the search URL is always built from ``query``.
+            query: Search terms. They are URL-encoded before being placed in
+                the query string.
+            search_type: ``"text"`` builds a web-search URL; any other value
+                builds the image-search URL. The page script is only injected
+                when the value is ``"image"``.
+            schema_cache_path: Directory under which ``schema/*.json`` files
+                are cached; defaults to ``get_home_folder()``.
+            **kwargs: ``cache_mode`` and ``delay`` override the crawl defaults.
+
+        Returns:
+            A JSON string: the extracted data, or an object with an ``error``
+            key when the crawl or the page script failed.
+        """
+        from urllib.parse import quote_plus
+
+        # Encode the query so spaces, '&', '#', etc. cannot corrupt the URL.
+        url = f"https://www.google.com/search?q={quote_plus(query)}&gl=sg&hl=en"
+        if search_type != "text":
+            url += "&tbs=qdr:d&udm=2"
+
+        browser_config = BrowserConfig(headless=True, verbose=True)
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            config = CrawlerRunConfig(
+                cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
+                delay_before_return_html=kwargs.get(
+                    "delay", 2 if search_type == "image" else 1),
+                js_code=self.js_script if search_type == "image" else None,
+            )
+
+            result = await crawler.arun(url=url, config=config)
+            if not result.success:
+                # NOTE(review): the result object may expose the failure text
+                # as `error_message` rather than `error` — confirm against the
+                # CrawlResult model. Both are tried so neither spelling raises.
+                error = (
+                    getattr(result, "error_message", None)
+                    or getattr(result, "error", None)
+                    or "Unknown error"
+                )
+                return json.dumps({"error": error})
+
+            if search_type == "image":
+                # Treat a missing script result like a failed one instead of
+                # raising AttributeError on None.
+                js_result = result.js_execution_result or {}
+                if js_result.get("success", False) is False:
+                    return json.dumps({"error": js_result.get("error", "Unknown error")})
+                if js_result.get("results"):
+                    image_result = js_result["results"][0]
+                    if image_result.get("success", False) is False:
+                        return json.dumps({"error": image_result.get("error", "Unknown error")})
+                    return json.dumps(image_result["result"], indent=4)
+                # No script results: fall through to the schema-based
+                # extraction below, as before.
+
+            # For text search, extract structured data
+            schemas = await self._build_schemas(result.cleaned_html, schema_cache_path)
+            extracted = {
+                key: JsonCssExtractionStrategy(schema=schemas[key]).run(
+                    url=url, sections=[result.html]
+                )
+                for key in schemas
+            }
+            return json.dumps(extracted, indent=4)
+
+    async def _build_schemas(self, html: str, schema_cache_path: str = None) -> Dict[str, Dict]:
+        """Load the extraction schemas from cache, generating any that are missing.
+
+        Args:
+            html: Page HTML used as the sample when a schema must be generated.
+            schema_cache_path: Base directory for the ``schema`` cache folder;
+                defaults to ``get_home_folder()``.
+
+        Returns:
+            A dict with the keys ``organic_schema``, ``top_stories_schema`` and
+            ``suggested_query_schema``.
+        """
+        home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
+        os.makedirs(f"{home_dir}/schema", exist_ok=True)
+
+        cleaned_html = optimize_html(html, threshold=100)
+
+        schemas = {}
+        for name, (example, prompt) in self._SCHEMA_PROMPTS.items():
+            cache_file = f"{home_dir}/schema/{name}.json"
+            if os.path.exists(cache_file):
+                with open(cache_file, "r", encoding="utf-8") as f:
+                    schemas[name] = json.load(f)
+                continue
+
+            # Previously this passed an undefined `_html` name (NameError);
+            # the optimized HTML computed above is what gets sent.
+            schemas[name] = JsonCssExtractionStrategy.generate_schema(
+                html=cleaned_html,
+                target_json_example=example,
+                query=prompt,
+            )
+            with open(cache_file, "w", encoding="utf-8") as f:
+                f.write(json.dumps(schemas[name]))
+
+        return schemas
--- a/crawl4ai/crawlers/google_search/script.js
+++ b/crawl4ai/crawlers/google_search/script.js
@@ -0,0 +1,115 @@
+(() => {
+    // Function to extract image data from Google Images page
+    // Collect image records from the page's `window.W_jd` object.
+    // Returns an array of processed entries (possibly empty), in key order.
+    function extractImageData() {
+        // Without this guard Object.keys(undefined) throws and the whole
+        // script fails on pages that do not expose W_jd.
+        const store = window.W_jd;
+        if (!store) return [];
+
+        const keys = Object.keys(store);
+        let allImageData = [];
+        let currentPosition = 0;
+
+        // Get the symbol we'll use (from first valid entry)
+        let targetSymbol;
+        for (let key of keys) {
+            try {
+                const symbols = Object.getOwnPropertySymbols(store[key]);
+                if (symbols.length > 0) {
+                    targetSymbol = symbols[0];
+                    break;
+                }
+            } catch (e) {
+                // Entry is not an object; try the next key.
+                continue;
+            }
+        }
+
+        if (!targetSymbol) return [];
+
+        // Iterate through ALL keys
+        for (let key of keys) {
+            try {
+                const o1 = store[key][targetSymbol]
+                if (!o1) continue;
+                const data = Object.values(o1)[0]
+                // Check if this is a valid image data entry
+                if (data && Array.isArray(data[1])) {
+                    const processedData = processImageEntry(data, currentPosition);
+                    if (processedData) {
+                        allImageData.push(processedData);
+                        // Positions only advance for entries that were kept.
+                        currentPosition++;
+                    }
+                }
+            } catch (e) {
+                // Best effort: a malformed entry is skipped, not fatal.
+                continue;
+            }
+        }
+
+        return allImageData;
+    }
+
+    // Turn one raw entry into a result object, or return null when the entry
+    // lacks an id, a matching DOM element, image/thumbnail info, or a link.
+    // `position` is zero-based; the returned `position` is one-based.
+    function processImageEntry(entry, position) {
+        const imageData = entry[1];
+        if (!Array.isArray(imageData)) return null;
+
+        // Extract the image ID
+        const imageId = imageData[1];
+        if (!imageId) return null;
+
+        // Find the corresponding DOM element
+        const domElement = document.querySelector(`[data-docid="${imageId}"]`);
+        if (!domElement) return null;
+
+        // Slots read from the entry array: 2 = thumbnail info, 3 = image info,
+        // 9 = metadata (may be absent). The info arrays are read below as
+        // [url, height, width].
+        const thumbnailInfo = imageData[2];
+        const imageInfo = imageData[3];
+        const metadata = imageData[9];
+
+        // Ensure we have the required data
+        if (!thumbnailInfo || !imageInfo) return null;
+
+        // Extract metadata from DOM
+        const title = domElement.querySelector('.toI8Rb')?.textContent?.trim();
+        const source = domElement.querySelector('.guK3rf')?.textContent?.trim();
+        const link = domElement.querySelector('a.EZAeBe')?.href;
+
+        if (!link) return null;
+
+        // Build Google Image URL
+        const googleUrl = buildGoogleImageUrl(imageInfo[0], link, imageId, imageInfo[1], imageInfo[2]);
+
+        return {
+            title,
+            imageUrl: imageInfo[0],
+            imageWidth: imageInfo[2],
+            imageHeight: imageInfo[1],
+            thumbnailUrl: thumbnailInfo[0],
+            thumbnailWidth: thumbnailInfo[2],
+            thumbnailHeight: thumbnailInfo[1],
+            source,
+            // `metadata` can be undefined for short entries; optional chaining
+            // lets the hostname fallback apply instead of throwing.
+            domain: metadata?.['2000']?.[1] || new URL(link).hostname,
+            link,
+            googleUrl,
+            position: position + 1
+        };
+    }
+
+    // Compose the google.com/imgres URL for an image. `tbnid` is sent as both
+    // the `tbnid` and `docid` parameters.
+    function buildGoogleImageUrl(imgUrl, refUrl, tbnid, height, width) {
+        const query = new URLSearchParams();
+        const pairs = [
+            ['imgurl', imgUrl],
+            ['tbnid', tbnid],
+            ['imgrefurl', refUrl],
+            ['docid', tbnid],
+            ['w', width.toString()],
+            ['h', height.toString()],
+        ];
+        for (const [name, value] of pairs) {
+            query.append(name, value);
+        }
+
+        return 'https://www.google.com/imgres?' + query.toString();
+    }
+    return extractImageData();
+})();
--- a/crawl4ai/hub.py
+++ b/crawl4ai/hub.py
@@ -0,0 +1,73 @@
+import importlib
+import pkgutil
+from pathlib import Path
+import logging
+from typing import Dict, Type
+import inspect
+
+logger = logging.getLogger(__name__)
+
+# crawl4ai/base.py
+from abc import ABC, abstractmethod
+from typing import Optional, Dict, Any
+import json
+import logging
+
+class BaseCrawler(ABC):
+    """Abstract base for hub crawlers; subclasses supply an async ``run``."""
+
+    def __init__(self):
+        # Each crawler logs under its own concrete class name.
+        self.logger = logging.getLogger(type(self).__name__)
+
+    @abstractmethod
+    async def run(self, url: str = "", **kwargs) -> str:
+        """
+        Crawl and return the result as a JSON string.
+        Implementations take a URL plus arbitrary keyword arguments.
+        """
+        pass
+
+    def __init_subclass__(cls, **kwargs):
+        """Validate the subclass's ``run`` when the subclass is created.
+
+        Raises:
+            TypeError: If ``run`` takes fewer than two positional parameters
+                (``self`` and the URL) or is not a coroutine function.
+        """
+        super().__init_subclass__(**kwargs)
+
+        candidate = cls.run
+
+        # Needs at least two positional parameters: self + url.
+        if candidate.__code__.co_argcount < 2:
+            raise TypeError(f"{cls.__name__} must implement 'run(self, url: str, **kwargs)'")
+
+        # ``run`` has to be declared with ``async def``.
+        if not inspect.iscoroutinefunction(candidate):
+            raise TypeError(f"{cls.__name__}.run must be async")
+
+class CrawlerHub:
+    """Registry that lazily discovers crawler classes under ``crawlers/``.
+
+    Each sub-directory of ``crawlers`` is expected to contain a ``crawler``
+    module; every ``BaseCrawler`` subclass found in that module is registered
+    under the directory name.
+    """
+
+    # Maps crawler directory name -> registered crawler class.
+    _crawlers: Dict[str, Type[BaseCrawler]] = {}
+
+    @classmethod
+    def _discover_crawlers(cls):
+        """Import ``crawl4ai.crawlers.<dir>.crawler`` for each crawler directory.
+
+        Import or registration failures are logged as warnings and skipped so
+        that one broken crawler does not stop the others from loading.
+        """
+        base_path = Path(__file__).parent / "crawlers"
+        if not base_path.is_dir():
+            # Nothing to discover; avoid FileNotFoundError from iterdir().
+            return
+        for crawler_dir in base_path.iterdir():
+            # Skip plain files and non-crawler directories such as
+            # __pycache__, which would only produce a spurious warning.
+            if not crawler_dir.is_dir() or crawler_dir.name.startswith(("_", ".")):
+                continue
+            try:
+                module = importlib.import_module(
+                    f"crawl4ai.crawlers.{crawler_dir.name}.crawler"
+                )
+                for attr in dir(module):
+                    cls._maybe_register_crawler(
+                        getattr(module, attr), crawler_dir.name
+                    )
+            except Exception as e:
+                logger.warning(f"Failed {crawler_dir.name}: {str(e)}")
+
+    @classmethod
+    def _maybe_register_crawler(cls, obj, name: str):
+        """Register ``obj`` under ``name`` if it is a ``BaseCrawler`` subclass.
+
+        The class gets a ``meta`` attribute taken from the defining module's
+        ``__meta__``, or from the class's own ``__meta__`` attribute, or ``{}``.
+        """
+        if not (isinstance(obj, type) and issubclass(obj, BaseCrawler)):
+            return
+        if obj is BaseCrawler:
+            return
+        module = importlib.import_module(obj.__module__)
+        # Metadata may be declared at module level or as a class attribute;
+        # previously a class-level __meta__ was ignored and meta became {}.
+        meta = getattr(module, "__meta__", None)
+        if meta is None:
+            meta = getattr(obj, "__meta__", {})
+        obj.meta = meta
+        cls._crawlers[name] = obj
+
+    @classmethod
+    def get(cls, name: str) -> Optional[Type[BaseCrawler]]:
+        """Return the crawler class registered under ``name``, or ``None``.
+
+        Discovery runs on the first call, and again on later calls while the
+        registry is still empty.
+        """
+        # Optional[...] instead of ``Type[...] | None``: the latter is
+        # evaluated at definition time and fails on Python 3.9.
+        if not cls._crawlers:
+            cls._discover_crawlers()
+        return cls._crawlers.get(name)
--- a/crawl4ai/legacy/__init__.py
+++ b/crawl4ai/legacy/__init__.py
--- a/crawl4ai/legacy/cli.py
+++ b/crawl4ai/legacy/cli.py
--- a/crawl4ai/legacy/crawler_strategy.py
+++ b/crawl4ai/legacy/crawler_strategy.py
--- a/crawl4ai/legacy/database.py
+++ b/crawl4ai/legacy/database.py
--- a/crawl4ai/legacy/docs_manager.py
+++ b/crawl4ai/legacy/docs_manager.py
--- a/crawl4ai/legacy/llmtxt.py
+++ b/crawl4ai/legacy/llmtxt.py
--- a/crawl4ai/legacy/version_manager.py
+++ b/crawl4ai/legacy/version_manager.py
--- a/crawl4ai/legacy/web_crawler.py
+++ b/crawl4ai/legacy/web_crawler.py
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -28,6 +28,35 @@ import hashlib
 from urllib.parse import urljoin, urlparse
 from urllib.robotparser import RobotFileParser
 import aiohttp
+from pathlib import Path
+from packaging import version
+from . import __version__
+
+
+class VersionManager:
+    """Track the library version recorded in ``~/.crawl4ai/version.txt``."""
+
+    def __init__(self):
+        self.home_dir = Path.home() / ".crawl4ai"
+        self.version_file = self.home_dir / "version.txt"
+
+    def get_installed_version(self):
+        """Return the parsed version stored in the version file.
+
+        Returns:
+            The parsed version, or ``None`` if the file is missing, cannot be
+            read, or does not contain a valid version string.
+        """
+        if not self.version_file.exists():
+            return None
+        try:
+            return version.parse(self.version_file.read_text().strip())
+        except (OSError, ValueError):
+            # OSError: the file could not be read. ValueError: undecodable
+            # text, or packaging's InvalidVersion (a ValueError subclass).
+            return None
+
+    def update_version(self):
+        """Write the current library version to the version file."""
+        # Create ~/.crawl4ai first so write_text does not raise
+        # FileNotFoundError when the directory does not exist yet.
+        self.home_dir.mkdir(parents=True, exist_ok=True)
+        self.version_file.write_text(__version__.__version__)
+
+    def needs_update(self):
+        """Return True if no version is recorded or it is older than the current one."""
+        installed = self.get_installed_version()
+        current = version.parse(__version__.__version__)
+        return installed is None or installed < current
+

 class RobotsParser:
    # Default 7 days cache TTL
--- a/tests/20241401/test_crawlers.py
+++ b/tests/20241401/test_crawlers.py
@@ -0,0 +1,17 @@
+
+# example_usage.py
+import asyncio
+from crawl4ai.crawlers import get_crawler
+
+async def main():
+    """Look up a crawler by name, call it on example.com and print the result."""
+    # Get the registered crawler
+    # NOTE(review): no `get_crawler` definition and no "example_site.content"
+    # crawler are visible in this change — confirm both exist before relying
+    # on this script.
+    example_crawler = get_crawler("example_site.content")
+    
+    # Crawl example.com
+    result = await example_crawler(url="https://example.com")
+        
+    print(result)
+            
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/tests/hub/test_simple.py
+++ b/tests/hub/test_simple.py
@@ -0,0 +1,30 @@
+# test.py
+from crawl4ai import CrawlerHub
+import json
+
+async def amazon_example():
+    """Fetch the Amazon crawler from the hub and print its metadata and output."""
+    crawler_cls = CrawlerHub.get("amazon_product")
+    if not crawler_cls:
+        print("Crawler not found!")
+        return
+
+    crawler = crawler_cls()
+    meta = crawler_cls.meta
+    print(f"Crawler version: {meta['version']}")
+    print(f"Rate limits: {meta.get('rate_limit', 'Unlimited')}")
+    output = await crawler.run("https://amazon.com/test")
+    print(output)
+
+async def google_example():
+    """Run a text search and an image search through the hub's Google crawler.
+
+    Prints the decoded text results and the raw image results.
+    """
+    from pathlib import Path
+
+    # Get crawler dynamically
+    crawler_cls = CrawlerHub.get("google_search")
+    if crawler_cls is None:
+        # CrawlerHub.get returns None for unknown or failed-to-load crawlers.
+        print("Crawler not found!")
+        return
+    crawler = crawler_cls()
+
+    # Cache schemas under the current user's home directory instead of a
+    # path that only exists on one developer's machine.
+    schema_cache_path = str(Path.home() / ".crawl4ai")
+
+    # Text search
+    text_results = await crawler.run(query="apple inc", search_type="text", schema_cache_path=schema_cache_path)
+    print(json.loads(text_results))
+
+    # Image search
+    image_results = await crawler.run(query="apple inc", search_type="image")
+    print(image_results)
+
+if __name__ == "__main__":
+    import asyncio
+    # Only the Google example runs; the Amazon example call is commented out.
+    # asyncio.run(amazon_example())
+    asyncio.run(google_example())