refactor(core): reorganize project structure and remove legacy code

Major reorganization of the project structure:

- Moved legacy synchronous crawler code to the legacy folder
- Removed deprecated CLI and docs manager
- Consolidated version manager into utils.py
- Added CrawlerHub to __init__.py exports
- Fixed type hints in async_webcrawler.py
- Fixed minor bugs in chunking and crawler strategies

BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
2025-01-30 19:35:06 +08:00
parent 31938fb922
commit f81712eb91
23 changed files with 425 additions and 4 deletions
--- a/crawl4ai/crawlers/__init__.py
+++ b/crawl4ai/crawlers/__init__.py
--- a/crawl4ai/crawlers/amazon_product/__init__.py
+++ b/crawl4ai/crawlers/amazon_product/__init__.py
--- a/crawl4ai/crawlers/amazon_product/crawler.py
+++ b/crawl4ai/crawlers/amazon_product/crawler.py
@@ -0,0 +1,20 @@
+import json
+
+from crawl4ai.hub import BaseCrawler
+
+__meta__ = {  # Module-level crawler metadata (GoogleSearchCrawler declares its __meta__ on the class instead)
+    "version": "1.2.0",
+    "tested_on": ["amazon.com"],
+    "rate_limit": "50 RPM",  # NOTE(review): presumably requests per minute; nothing in this file enforces it - confirm
+    "schema": {"product": ["name", "price"]}  # NOTE(review): run() currently returns only "name" - confirm intended shape
+}
+
+class AmazonProductCrawler(BaseCrawler):
+    async def run(self, url: str, **kwargs) -> str:
+        try:
+            self.logger.info(f"Crawling {url}")
+            return '{"product": {"name": "Test Amazon Product"}}'
+        except Exception as e:
+            self.logger.error(f"Crawl failed: {str(e)}")
+            return json.dumps({
+                "error": str(e),
+                "metadata": self.meta  # Include meta in error response
+            })            
--- a/crawl4ai/crawlers/google_search/__init__.py
+++ b/crawl4ai/crawlers/google_search/__init__.py
--- a/crawl4ai/crawlers/google_search/crawler.py
+++ b/crawl4ai/crawlers/google_search/crawler.py
@@ -0,0 +1,125 @@
+from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.hub import BaseCrawler
+from crawl4ai.utils import optimize_html, get_home_folder
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from pathlib import Path
+import json
+import os
+import asyncio
+from typing import Dict, Any
+
+
+class GoogleSearchCrawler(BaseCrawler):
+    __meta__ = {
+        "version": "1.0.0",
+        "tested_on": ["google.com/search*"],
+        "rate_limit": "10 RPM",
+        "description": "Crawls Google Search results (text + images)",
+    }
+
+    def __init__(self):
+        super().__init__()
+        self.js_script = (Path(__file__).parent /
+                          "script.js").read_text()
+
+    async def run(self, url="", query: str = "", search_type: str = "text", schema_cache_path = None, **kwargs) -> str:
+        """Crawl Google Search results for a query"""
+        url = f"https://www.google.com/search?q={query}&gl=sg&hl=en" if search_type == "text" else f"https://www.google.com/search?q={query}&gl=sg&hl=en&tbs=qdr:d&udm=2"
+        browser_config = BrowserConfig(headless=True, verbose=True)
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            config = CrawlerRunConfig(
+                cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
+                delay_before_return_html=kwargs.get(
+                    "delay", 2 if search_type == "image" else 1),
+                js_code=self.js_script if search_type == "image" else None,
+            )
+
+            result = await crawler.arun(url=url, config=config)
+            if not result.success:
+                return json.dumps({"error": result.error})
+
+            if search_type == "image":
+                if result.js_execution_result.get("success", False) is False:
+                    return json.dumps({"error": result.js_execution_result.get("error", "Unknown error")})
+                if "results" in result.js_execution_result:
+                    image_result = result.js_execution_result['results'][0]
+                    if image_result.get("success", False) is False:
+                        return json.dumps({"error": image_result.get("error", "Unknown error")})
+                    return json.dumps(image_result["result"], indent=4)
+
+            # For text search, extract structured data
+            schemas = await self._build_schemas(result.cleaned_html, schema_cache_path)
+            extracted = {
+                key: JsonCssExtractionStrategy(schema=schemas[key]).run(
+                    url=url, sections=[result.html]
+                )
+                for key in schemas
+            }
+            return json.dumps(extracted, indent=4)
+
+    async def _build_schemas(self, html: str, schema_cache_path: str = None) -> Dict[str, Dict]:
+        """Build extraction schemas (organic, top stories, etc.)"""
+        home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
+        os.makedirs(f"{home_dir}/schema", exist_ok=True)
+
+        cleaned_html = optimize_html(html, threshold=100)
+
+        organic_schema = None
+        if os.path.exists(f"{home_dir}/schema/organic_schema.json"):
+            with open(f"{home_dir}/schema/organic_schema.json", "r") as f:
+                organic_schema = json.load(f)
+        else:
+            organic_schema = JsonCssExtractionStrategy.generate_schema(
+                html=_html,
+                target_json_example="""{
+            "title": "...",
+            "link": "...",
+            "snippet": "...",
+            "date": "1 hour ago",
+        }""",
+                query="""The given html is the crawled html from Google search result. Please find the schema for organic search item in the given html, I am interested in title, link, snippet text. date."""
+            )
+
+            with open(f"{home_dir}/schema/organic_schema.json", "w") as f:
+                f.write(json.dumps(organic_schema))
+
+        top_stories_schema = None
+        if os.path.exists(f"{home_dir}/schema/top_stories_schema.json"):
+            with open(f"{home_dir}/schema/top_stories_schema.json", "r") as f:
+                top_stories_schema = json.load(f)
+        else:
+            top_stories_schema = JsonCssExtractionStrategy.generate_schema(
+                html=_html,
+                target_json_example="""{
+            "title": "...",
+            "link": "...",
+            "source": "Insider Monkey",
+            "date": "1 hour ago",
+            "imageUrl": "..."
+        }""",
+                query="""The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl."""
+            )
+
+            with open(f"{home_dir}/schema/top_stories_schema.json", "w") as f:
+                f.write(json.dumps(top_stories_schema))
+
+        suggested_query_schema = None
+        if os.path.exists(f"{home_dir}/schema/suggested_query_schema.json"):
+            with open(f"{home_dir}/schema/suggested_query_schema.json", "r") as f:
+                suggested_query_schema = json.load(f)
+        else:
+            suggested_query_schema = JsonCssExtractionStrategy.generate_schema(
+                html=_html,
+                target_json_example="""{
+            "query": "A for Apple",
+        }""",
+                query="""The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "People also search for" within the given HTML. I am interested in the queries only."""
+            )
+            with open(f"{home_dir}/schema/suggested_query_schema.json", "w") as f:
+                f.write(json.dumps(suggested_query_schema))
+
+        return {
+            "organic_schema": organic_schema,
+            "top_stories_schema": top_stories_schema,
+            "suggested_query_schema": suggested_query_schema,
+        }
--- a/crawl4ai/crawlers/google_search/script.js
+++ b/crawl4ai/crawlers/google_search/script.js
@@ -0,0 +1,115 @@
+(() => {
+    // Function to extract image data from Google Images page
+    function extractImageData() {
+        const keys = Object.keys(window.W_jd);
+        let allImageData = [];
+        let currentPosition = 0;
+
+        // Get the symbol we'll use (from first valid entry)
+        let targetSymbol;
+        for (let key of keys) {
+            try {
+                const symbols = Object.getOwnPropertySymbols(window.W_jd[key]);
+                if (symbols.length > 0) {
+                    targetSymbol = symbols[0];
+                    break;
+                }
+            } catch (e) {
+                continue;
+            }
+        }
+
+        if (!targetSymbol) return [];
+
+        // Iterate through ALL keys
+        for (let key of keys) {
+            try {
+                const o1 = window.W_jd[key][targetSymbol]
+                if (!o1) continue;
+                const data = Object.values(o1)[0]
+                // const data = window.W_jd[key][targetSymbol]?.Ws;
+                // Check if this is a valid image data entry
+                if (data && Array.isArray(data[1])) {
+                    const processedData = processImageEntry(data, currentPosition);
+                    if (processedData) {
+                        allImageData.push(processedData);
+                        currentPosition++;
+                    }
+                }
+            } catch (e) {
+                continue;
+            }
+        }
+
+        return allImageData;
+    }
+
+    function processImageEntry(entry, position) {
+        const imageData = entry[1];
+        if (!Array.isArray(imageData)) return null;
+
+        // Extract the image ID
+        const imageId = imageData[1];
+        if (!imageId) return null;
+
+        // Find the corresponding DOM element
+        const domElement = document.querySelector(`[data-docid="${imageId}"]`);
+        if (!domElement) return null;
+
+        // Extract data from the array structure
+        const [
+            _,
+            id,
+            thumbnailInfo,
+            imageInfo,
+            __,
+            ___,
+            rgb,
+            ____,
+            _____,
+            metadata
+        ] = imageData;
+
+        // Ensure we have the required data
+        if (!thumbnailInfo || !imageInfo) return null;
+
+        // Extract metadata from DOM
+        const title = domElement?.querySelector('.toI8Rb')?.textContent?.trim();
+        const source = domElement?.querySelector('.guK3rf')?.textContent?.trim();
+        const link = domElement?.querySelector('a.EZAeBe')?.href;
+
+        if (!link) return null;
+
+        // Build Google Image URL
+        const googleUrl = buildGoogleImageUrl(imageInfo[0], link, imageId, imageInfo[1], imageInfo[2]);
+
+        return {
+            title,
+            imageUrl: imageInfo[0],
+            imageWidth: imageInfo[2],
+            imageHeight: imageInfo[1],
+            thumbnailUrl: thumbnailInfo[0],
+            thumbnailWidth: thumbnailInfo[2],
+            thumbnailHeight: thumbnailInfo[1],
+            source,
+            domain: metadata['2000']?.[1] || new URL(link).hostname,
+            link,
+            googleUrl,
+            position: position + 1
+        };
+    }
+
+    function buildGoogleImageUrl(imgUrl, refUrl, tbnid, height, width) {
+        const params = new URLSearchParams({
+            imgurl: imgUrl,
+            tbnid: tbnid,
+            imgrefurl: refUrl,
+            docid: tbnid,
+            w: width.toString(),
+            h: height.toString(),
+        });
+
+        return `https://www.google.com/imgres?${params.toString()}`;
+    }
+    return extractImageData();
+})();