refactor(core): reorganize project structure and remove legacy code

Major reorganization of the project structure: - Moved legacy synchronous crawler code to legacy folder - Removed deprecated CLI and docs manager - Consolidated version manager into utils.py - Added CrawlerHub to __init__.py exports - Fixed type hints in async_webcrawler.py - Fixed minor bugs in chunking and crawler strategies BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
2025-01-30 19:35:06 +08:00
parent 31938fb922
commit f81712eb91
23 changed files with 425 additions and 4 deletions
--- a/crawl4ai/crawlers/google_search/crawler.py
+++ b/crawl4ai/crawlers/google_search/crawler.py
@@ -0,0 +1,125 @@
+from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.hub import BaseCrawler
+from crawl4ai.utils import optimize_html, get_home_folder
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from pathlib import Path
+import json
+import os
+import asyncio
+from typing import Dict, Any
+
+
+class GoogleSearchCrawler(BaseCrawler):
+    __meta__ = {
+        "version": "1.0.0",
+        "tested_on": ["google.com/search*"],
+        "rate_limit": "10 RPM",
+        "description": "Crawls Google Search results (text + images)",
+    }
+
+    def __init__(self):
+        super().__init__()
+        self.js_script = (Path(__file__).parent /
+                          "script.js").read_text()
+
+    async def run(self, url="", query: str = "", search_type: str = "text", schema_cache_path = None, **kwargs) -> str:
+        """Crawl Google Search results for a query"""
+        url = f"https://www.google.com/search?q={query}&gl=sg&hl=en" if search_type == "text" else f"https://www.google.com/search?q={query}&gl=sg&hl=en&tbs=qdr:d&udm=2"
+        browser_config = BrowserConfig(headless=True, verbose=True)
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            config = CrawlerRunConfig(
+                cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
+                delay_before_return_html=kwargs.get(
+                    "delay", 2 if search_type == "image" else 1),
+                js_code=self.js_script if search_type == "image" else None,
+            )
+
+            result = await crawler.arun(url=url, config=config)
+            if not result.success:
+                return json.dumps({"error": result.error})
+
+            if search_type == "image":
+                if result.js_execution_result.get("success", False) is False:
+                    return json.dumps({"error": result.js_execution_result.get("error", "Unknown error")})
+                if "results" in result.js_execution_result:
+                    image_result = result.js_execution_result['results'][0]
+                    if image_result.get("success", False) is False:
+                        return json.dumps({"error": image_result.get("error", "Unknown error")})
+                    return json.dumps(image_result["result"], indent=4)
+
+            # For text search, extract structured data
+            schemas = await self._build_schemas(result.cleaned_html, schema_cache_path)
+            extracted = {
+                key: JsonCssExtractionStrategy(schema=schemas[key]).run(
+                    url=url, sections=[result.html]
+                )
+                for key in schemas
+            }
+            return json.dumps(extracted, indent=4)
+
+    async def _build_schemas(self, html: str, schema_cache_path: str = None) -> Dict[str, Dict]:
+        """Build extraction schemas (organic, top stories, etc.)"""
+        home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
+        os.makedirs(f"{home_dir}/schema", exist_ok=True)
+
+        cleaned_html = optimize_html(html, threshold=100)
+
+        organic_schema = None
+        if os.path.exists(f"{home_dir}/schema/organic_schema.json"):
+            with open(f"{home_dir}/schema/organic_schema.json", "r") as f:
+                organic_schema = json.load(f)
+        else:
+            organic_schema = JsonCssExtractionStrategy.generate_schema(
+                html=_html,
+                target_json_example="""{
+            "title": "...",
+            "link": "...",
+            "snippet": "...",
+            "date": "1 hour ago",
+        }""",
+                query="""The given html is the crawled html from Google search result. Please find the schema for organic search item in the given html, I am interested in title, link, snippet text. date."""
+            )
+
+            with open(f"{home_dir}/schema/organic_schema.json", "w") as f:
+                f.write(json.dumps(organic_schema))
+
+        top_stories_schema = None
+        if os.path.exists(f"{home_dir}/schema/top_stories_schema.json"):
+            with open(f"{home_dir}/schema/top_stories_schema.json", "r") as f:
+                top_stories_schema = json.load(f)
+        else:
+            top_stories_schema = JsonCssExtractionStrategy.generate_schema(
+                html=_html,
+                target_json_example="""{
+            "title": "...",
+            "link": "...",
+            "source": "Insider Monkey",
+            "date": "1 hour ago",
+            "imageUrl": "..."
+        }""",
+                query="""The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl."""
+            )
+
+            with open(f"{home_dir}/schema/top_stories_schema.json", "w") as f:
+                f.write(json.dumps(top_stories_schema))
+
+        suggested_query_schema = None
+        if os.path.exists(f"{home_dir}/schema/suggested_query_schema.json"):
+            with open(f"{home_dir}/schema/suggested_query_schema.json", "r") as f:
+                suggested_query_schema = json.load(f)
+        else:
+            suggested_query_schema = JsonCssExtractionStrategy.generate_schema(
+                html=_html,
+                target_json_example="""{
+            "query": "A for Apple",
+        }""",
+                query="""The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "People also search for" within the given HTML. I am interested in the queries only."""
+            )
+            with open(f"{home_dir}/schema/suggested_query_schema.json", "w") as f:
+                f.write(json.dumps(suggested_query_schema))
+
+        return {
+            "organic_schema": organic_schema,
+            "top_stories_schema": top_stories_schema,
+            "suggested_query_schema": suggested_query_schema,
+        }