From f81712eb91dbb9ed12af61a7d68ac0ad41200b93 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 30 Jan 2025 19:35:06 +0800 Subject: [PATCH] refactor(core): reorganize project structure and remove legacy code Major reorganization of the project structure: - Moved legacy synchronous crawler code to legacy folder - Removed deprecated CLI and docs manager - Consolidated version manager into utils.py - Added CrawlerHub to __init__.py exports - Fixed type hints in async_webcrawler.py - Fixed minor bugs in chunking and crawler strategies BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler. --- crawl4ai/__init__.py | 4 + crawl4ai/async_crawler_strategy.py | 2 + crawl4ai/async_database.py | 2 +- crawl4ai/async_webcrawler.py | 10 +- crawl4ai/chunking_strategy.py | 2 +- crawl4ai/crawlers/__init__.py | 0 crawl4ai/crawlers/amazon_product/__init__.py | 0 crawl4ai/crawlers/amazon_product/crawler.py | 20 +++ crawl4ai/crawlers/google_search/__init__.py | 0 crawl4ai/crawlers/google_search/crawler.py | 125 +++++++++++++++++++ crawl4ai/crawlers/google_search/script.js | 115 +++++++++++++++++ crawl4ai/hub.py | 73 +++++++++++ crawl4ai/legacy/__init__.py | 0 crawl4ai/{ => legacy}/cli.py | 0 crawl4ai/{ => legacy}/crawler_strategy.py | 0 crawl4ai/{ => legacy}/database.py | 0 crawl4ai/{ => legacy}/docs_manager.py | 0 crawl4ai/{ => legacy}/llmtxt.py | 0 crawl4ai/{ => legacy}/version_manager.py | 0 crawl4ai/{ => legacy}/web_crawler.py | 0 crawl4ai/utils.py | 29 +++++ tests/20241401/test_crawlers.py | 17 +++ tests/hub/test_simple.py | 30 +++++ 23 files changed, 425 insertions(+), 4 deletions(-) create mode 100644 crawl4ai/crawlers/__init__.py create mode 100644 crawl4ai/crawlers/amazon_product/__init__.py create mode 100644 crawl4ai/crawlers/amazon_product/crawler.py create mode 100644 crawl4ai/crawlers/google_search/__init__.py create mode 100644 crawl4ai/crawlers/google_search/crawler.py create mode 100644 crawl4ai/crawlers/google_search/script.js create mode 100644 crawl4ai/hub.py create mode 100644 crawl4ai/legacy/__init__.py rename crawl4ai/{ => legacy}/cli.py (100%) rename crawl4ai/{ => legacy}/crawler_strategy.py (100%) rename crawl4ai/{ => legacy}/database.py (100%) rename crawl4ai/{ => legacy}/docs_manager.py (100%) rename crawl4ai/{ => legacy}/llmtxt.py (100%) rename crawl4ai/{ => legacy}/version_manager.py (100%) rename crawl4ai/{ => legacy}/web_crawler.py (100%) create mode 100644 tests/20241401/test_crawlers.py create mode 100644 tests/hub/test_simple.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 7f284323..fd3558e3 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -14,6 +14,8 @@ from .extraction_strategy import ( JsonCssExtractionStrategy, JsonXPathExtractionStrategy ) + + from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter @@ -26,10 +28,12 @@ from .async_dispatcher import ( DisplayMode, BaseDispatcher ) +from .hub import CrawlerHub __all__ = [ "AsyncWebCrawler", "CrawlResult", + "CrawlerHub", "CacheMode", "ContentScrapingStrategy", "WebScrapingStrategy", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a8330060..9ae9b5a8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1265,6 +1265,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ config.url = url response_headers = {} + execution_result = None status_code = None redirected_url = url @@ -1522,6 +1523,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): execution_result = await self.robust_execute_user_script( page, config.js_code ) + if not execution_result["success"]: self.logger.warning( message="User script execution had issues: {error}", diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index b0c20f29..66a7c683 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -9,7 +9,7 @@ import json # Added for serialization/deserialization from .utils import ensure_content_dirs, generate_content_hash from .models import CrawlResult, MarkdownGenerationResult import aiofiles -from .version_manager import VersionManager +from .utils import VersionManager from .async_logger import AsyncLogger from .utils import get_error_context, create_box_message diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 56a31620..746641ec 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -49,6 +49,12 @@ from collections.abc import AsyncGenerator CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] +DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] +DeepCrawlManyReturn = Union[ + List[List[CrawlResultT]], + AsyncGenerator[CrawlResultT, None], +] + from .__version__ import __version__ as crawl4ai_version @@ -282,7 +288,7 @@ class AsyncWebCrawler: user_agent: str = None, verbose=True, **kwargs, - ) -> CrawlResult: + ) -> Union[CrawlResult, DeepCrawlSingleReturn]: """ Runs the crawler for a single source: URL (web, local file, or raw HTML). @@ -709,7 +715,7 @@ class AsyncWebCrawler: user_agent: str = None, verbose=True, **kwargs - ) -> RunManyReturn: + ) -> Union[RunManyReturn, DeepCrawlManyReturn]: """ Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy. diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py index ca188d1d..f46cb667 100644 --- a/crawl4ai/chunking_strategy.py +++ b/crawl4ai/chunking_strategy.py @@ -4,7 +4,6 @@ from collections import Counter import string from .model_loader import load_nltk_punkt - # Define the abstract base class for chunking strategies class ChunkingStrategy(ABC): """ @@ -72,6 +71,7 @@ class NlpSentenceChunking(ChunkingStrategy): """ Initialize the NlpSentenceChunking object. """ + from crawl4ai.le.legacy.model_loader import load_nltk_punkt load_nltk_punkt() def chunk(self, text: str) -> list: diff --git a/crawl4ai/crawlers/__init__.py b/crawl4ai/crawlers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/crawlers/amazon_product/__init__.py b/crawl4ai/crawlers/amazon_product/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/crawlers/amazon_product/crawler.py b/crawl4ai/crawlers/amazon_product/crawler.py new file mode 100644 index 00000000..45cc9d6a --- /dev/null +++ b/crawl4ai/crawlers/amazon_product/crawler.py @@ -0,0 +1,20 @@ +from crawl4ai.hub import BaseCrawler + +__meta__ = { + "version": "1.2.0", + "tested_on": ["amazon.com"], + "rate_limit": "50 RPM", + "schema": {"product": ["name", "price"]} +} + +class AmazonProductCrawler(BaseCrawler): + async def run(self, url: str, **kwargs) -> str: + try: + self.logger.info(f"Crawling {url}") + return '{"product": {"name": "Test Amazon Product"}}' + except Exception as e: + self.logger.error(f"Crawl failed: {str(e)}") + return json.dumps({ + "error": str(e), + "metadata": self.meta # Include meta in error response + }) \ No newline at end of file diff --git a/crawl4ai/crawlers/google_search/__init__.py b/crawl4ai/crawlers/google_search/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/crawlers/google_search/crawler.py b/crawl4ai/crawlers/google_search/crawler.py new file mode 100644 index 00000000..b1f7d725 --- /dev/null +++ b/crawl4ai/crawlers/google_search/crawler.py @@ -0,0 +1,125 @@ +from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode +from crawl4ai.hub import BaseCrawler +from crawl4ai.utils import optimize_html, get_home_folder +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from pathlib import Path +import json +import os +import asyncio +from typing import Dict, Any + + +class GoogleSearchCrawler(BaseCrawler): + __meta__ = { + "version": "1.0.0", + "tested_on": ["google.com/search*"], + "rate_limit": "10 RPM", + "description": "Crawls Google Search results (text + images)", + } + + def __init__(self): + super().__init__() + self.js_script = (Path(__file__).parent / + "script.js").read_text() + + async def run(self, url="", query: str = "", search_type: str = "text", schema_cache_path = None, **kwargs) -> str: + """Crawl Google Search results for a query""" + url = f"https://www.google.com/search?q={query}&gl=sg&hl=en" if search_type == "text" else f"https://www.google.com/search?q={query}&gl=sg&hl=en&tbs=qdr:d&udm=2" + browser_config = BrowserConfig(headless=True, verbose=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS), + delay_before_return_html=kwargs.get( + "delay", 2 if search_type == "image" else 1), + js_code=self.js_script if search_type == "image" else None, + ) + + result = await crawler.arun(url=url, config=config) + if not result.success: + return json.dumps({"error": result.error}) + + if search_type == "image": + if result.js_execution_result.get("success", False) is False: + return json.dumps({"error": result.js_execution_result.get("error", "Unknown error")}) + if "results" in result.js_execution_result: + image_result = result.js_execution_result['results'][0] + if image_result.get("success", False) is False: + return json.dumps({"error": image_result.get("error", "Unknown error")}) + return json.dumps(image_result["result"], indent=4) + + # For text search, extract structured data + schemas = await self._build_schemas(result.cleaned_html, schema_cache_path) + extracted = { + key: JsonCssExtractionStrategy(schema=schemas[key]).run( + url=url, sections=[result.html] + ) + for key in schemas + } + return json.dumps(extracted, indent=4) + + async def _build_schemas(self, html: str, schema_cache_path: str = None) -> Dict[str, Dict]: + """Build extraction schemas (organic, top stories, etc.)""" + home_dir = get_home_folder() if not schema_cache_path else schema_cache_path + os.makedirs(f"{home_dir}/schema", exist_ok=True) + + cleaned_html = optimize_html(html, threshold=100) + + organic_schema = None + if os.path.exists(f"{home_dir}/schema/organic_schema.json"): + with open(f"{home_dir}/schema/organic_schema.json", "r") as f: + organic_schema = json.load(f) + else: + organic_schema = JsonCssExtractionStrategy.generate_schema( + html=_html, + target_json_example="""{ + "title": "...", + "link": "...", + "snippet": "...", + "date": "1 hour ago", + }""", + query="""The given html is the crawled html from Google search result. Please find the schema for organic search item in the given html, I am interested in title, link, snippet text. date.""" + ) + + with open(f"{home_dir}/schema/organic_schema.json", "w") as f: + f.write(json.dumps(organic_schema)) + + top_stories_schema = None + if os.path.exists(f"{home_dir}/schema/top_stories_schema.json"): + with open(f"{home_dir}/schema/top_stories_schema.json", "r") as f: + top_stories_schema = json.load(f) + else: + top_stories_schema = JsonCssExtractionStrategy.generate_schema( + html=_html, + target_json_example="""{ + "title": "...", + "link": "...", + "source": "Insider Monkey", + "date": "1 hour ago", + "imageUrl": "..." + }""", + query="""The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl.""" + ) + + with open(f"{home_dir}/schema/top_stories_schema.json", "w") as f: + f.write(json.dumps(top_stories_schema)) + + suggested_query_schema = None + if os.path.exists(f"{home_dir}/schema/suggested_query_schema.json"): + with open(f"{home_dir}/schema/suggested_query_schema.json", "r") as f: + suggested_query_schema = json.load(f) + else: + suggested_query_schema = JsonCssExtractionStrategy.generate_schema( + html=_html, + target_json_example="""{ + "query": "A for Apple", + }""", + query="""The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "People also search for" within the given HTML. I am interested in the queries only.""" + ) + with open(f"{home_dir}/schema/suggested_query_schema.json", "w") as f: + f.write(json.dumps(suggested_query_schema)) + + return { + "organic_schema": organic_schema, + "top_stories_schema": top_stories_schema, + "suggested_query_schema": suggested_query_schema, + } diff --git a/crawl4ai/crawlers/google_search/script.js b/crawl4ai/crawlers/google_search/script.js new file mode 100644 index 00000000..33257465 --- /dev/null +++ b/crawl4ai/crawlers/google_search/script.js @@ -0,0 +1,115 @@ +(() => { + // Function to extract image data from Google Images page + function extractImageData() { + const keys = Object.keys(window.W_jd); + let allImageData = []; + let currentPosition = 0; + + // Get the symbol we'll use (from first valid entry) + let targetSymbol; + for (let key of keys) { + try { + const symbols = Object.getOwnPropertySymbols(window.W_jd[key]); + if (symbols.length > 0) { + targetSymbol = symbols[0]; + break; + } + } catch (e) { + continue; + } + } + + if (!targetSymbol) return []; + + // Iterate through ALL keys + for (let key of keys) { + try { + const o1 = window.W_jd[key][targetSymbol] + if (!o1) continue; + const data = Object.values(o1)[0] + // const data = window.W_jd[key][targetSymbol]?.Ws; + // Check if this is a valid image data entry + if (data && Array.isArray(data[1])) { + const processedData = processImageEntry(data, currentPosition); + if (processedData) { + allImageData.push(processedData); + currentPosition++; + } + } + } catch (e) { + continue; + } + } + + return allImageData; + } + + function processImageEntry(entry, position) { + const imageData = entry[1]; + if (!Array.isArray(imageData)) return null; + + // Extract the image ID + const imageId = imageData[1]; + if (!imageId) return null; + + // Find the corresponding DOM element + const domElement = document.querySelector(`[data-docid="${imageId}"]`); + if (!domElement) return null; + + // Extract data from the array structure + const [ + _, + id, + thumbnailInfo, + imageInfo, + __, + ___, + rgb, + ____, + _____, + metadata + ] = imageData; + + // Ensure we have the required data + if (!thumbnailInfo || !imageInfo) return null; + + // Extract metadata from DOM + const title = domElement?.querySelector('.toI8Rb')?.textContent?.trim(); + const source = domElement?.querySelector('.guK3rf')?.textContent?.trim(); + const link = domElement?.querySelector('a.EZAeBe')?.href; + + if (!link) return null; + + // Build Google Image URL + const googleUrl = buildGoogleImageUrl(imageInfo[0], link, imageId, imageInfo[1], imageInfo[2]); + + return { + title, + imageUrl: imageInfo[0], + imageWidth: imageInfo[2], + imageHeight: imageInfo[1], + thumbnailUrl: thumbnailInfo[0], + thumbnailWidth: thumbnailInfo[2], + thumbnailHeight: thumbnailInfo[1], + source, + domain: metadata['2000']?.[1] || new URL(link).hostname, + link, + googleUrl, + position: position + 1 + }; + } + + function buildGoogleImageUrl(imgUrl, refUrl, tbnid, height, width) { + const params = new URLSearchParams({ + imgurl: imgUrl, + tbnid: tbnid, + imgrefurl: refUrl, + docid: tbnid, + w: width.toString(), + h: height.toString(), + }); + + return `https://www.google.com/imgres?${params.toString()}`; + } + return extractImageData(); +})(); \ No newline at end of file diff --git a/crawl4ai/hub.py b/crawl4ai/hub.py new file mode 100644 index 00000000..fa6976f3 --- /dev/null +++ b/crawl4ai/hub.py @@ -0,0 +1,73 @@ +import importlib +import pkgutil +from pathlib import Path +import logging +from typing import Dict, Type +import inspect + +logger = logging.getLogger(__name__) + +# crawl4ai/base.py +from abc import ABC, abstractmethod +from typing import Optional, Dict, Any +import json +import logging + +class BaseCrawler(ABC): + def __init__(self): + self.logger = logging.getLogger(self.__class__.__name__) + + @abstractmethod + async def run(self, url: str = "", **kwargs) -> str: + """ + Implement this method to return JSON string. + Must accept URL + arbitrary kwargs for flexibility. + """ + pass + + def __init_subclass__(cls, **kwargs): + """Enforce interface validation on subclassing""" + super().__init_subclass__(**kwargs) + + # Verify run method signature + run_method = cls.run + if not run_method.__code__.co_argcount >= 2: # self + url + raise TypeError(f"{cls.__name__} must implement 'run(self, url: str, **kwargs)'") + + # Verify async nature + if not inspect.iscoroutinefunction(run_method): + raise TypeError(f"{cls.__name__}.run must be async") + +class CrawlerHub: + _crawlers: Dict[str, Type[BaseCrawler]] = {} + + @classmethod + def _discover_crawlers(cls): + """Dynamically load crawlers from /crawlers in 3 lines""" + base_path = Path(__file__).parent / "crawlers" + for crawler_dir in base_path.iterdir(): + if crawler_dir.is_dir(): + try: + module = importlib.import_module( + f"crawl4ai.crawlers.{crawler_dir.name}.crawler" + ) + for attr in dir(module): + cls._maybe_register_crawler( + getattr(module, attr), crawler_dir.name + ) + except Exception as e: + logger.warning(f"Failed {crawler_dir.name}: {str(e)}") + + @classmethod + def _maybe_register_crawler(cls, obj, name: str): + """Brilliant one-liner registration""" + if isinstance(obj, type) and issubclass(obj, BaseCrawler) and obj != BaseCrawler: + module = importlib.import_module(obj.__module__) + obj.meta = getattr(module, "__meta__", {}) + cls._crawlers[name] = obj + + @classmethod + def get(cls, name: str) -> Type[BaseCrawler] | None: + if not cls._crawlers: + cls._discover_crawlers() + return cls._crawlers.get(name) \ No newline at end of file diff --git a/crawl4ai/legacy/__init__.py b/crawl4ai/legacy/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/cli.py b/crawl4ai/legacy/cli.py similarity index 100% rename from crawl4ai/cli.py rename to crawl4ai/legacy/cli.py diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/legacy/crawler_strategy.py similarity index 100% rename from crawl4ai/crawler_strategy.py rename to crawl4ai/legacy/crawler_strategy.py diff --git a/crawl4ai/database.py b/crawl4ai/legacy/database.py similarity index 100% rename from crawl4ai/database.py rename to crawl4ai/legacy/database.py diff --git a/crawl4ai/docs_manager.py b/crawl4ai/legacy/docs_manager.py similarity index 100% rename from crawl4ai/docs_manager.py rename to crawl4ai/legacy/docs_manager.py diff --git a/crawl4ai/llmtxt.py b/crawl4ai/legacy/llmtxt.py similarity index 100% rename from crawl4ai/llmtxt.py rename to crawl4ai/legacy/llmtxt.py diff --git a/crawl4ai/version_manager.py b/crawl4ai/legacy/version_manager.py similarity index 100% rename from crawl4ai/version_manager.py rename to crawl4ai/legacy/version_manager.py diff --git a/crawl4ai/web_crawler.py b/crawl4ai/legacy/web_crawler.py similarity index 100% rename from crawl4ai/web_crawler.py rename to crawl4ai/legacy/web_crawler.py diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 51e88985..19975f24 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -28,6 +28,35 @@ import hashlib from urllib.parse import urljoin, urlparse from urllib.robotparser import RobotFileParser import aiohttp +from pathlib import Path +from packaging import version +from . import __version__ + + +class VersionManager: + def __init__(self): + self.home_dir = Path.home() / ".crawl4ai" + self.version_file = self.home_dir / "version.txt" + + def get_installed_version(self): + """Get the version recorded in home directory""" + if not self.version_file.exists(): + return None + try: + return version.parse(self.version_file.read_text().strip()) + except: + return None + + def update_version(self): + """Update the version file to current library version""" + self.version_file.write_text(__version__.__version__) + + def needs_update(self): + """Check if database needs update based on version""" + installed = self.get_installed_version() + current = version.parse(__version__.__version__) + return installed is None or installed < current + class RobotsParser: # Default 7 days cache TTL diff --git a/tests/20241401/test_crawlers.py b/tests/20241401/test_crawlers.py new file mode 100644 index 00000000..45fb8fcb --- /dev/null +++ b/tests/20241401/test_crawlers.py @@ -0,0 +1,17 @@ + +# example_usageexample_usageexample_usage# example_usage.py +import asyncio +from crawl4ai.crawlers import get_crawler + +async def main(): + # Get the registered crawler + example_crawler = get_crawler("example_site.content") + + # Crawl example.com + result = await example_crawler(url="https://example.com") + + print(result) + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/tests/hub/test_simple.py b/tests/hub/test_simple.py new file mode 100644 index 00000000..8eee5eaa --- /dev/null +++ b/tests/hub/test_simple.py @@ -0,0 +1,30 @@ +# test.py +from crawl4ai import CrawlerHub +import json + +async def amazon_example(): + if (crawler_cls := CrawlerHub.get("amazon_product")) : + crawler = crawler_cls() + print(f"Crawler version: {crawler_cls.meta['version']}") + print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}") + print(await crawler.run("https://amazon.com/test")) + else: + print("Crawler not found!") + +async def google_example(): + # Get crawler dynamically + crawler_cls = CrawlerHub.get("google_search") + crawler = crawler_cls() + + # Text search + text_results = await crawler.run(query="apple inc", search_type="text", schema_cache_path="/Users/unclecode/.crawl4ai") + print(json.loads(text_results)) + + # Image search + image_results = await crawler.run(query="apple inc", search_type="image") + print(image_results) + +if __name__ == "__main__": + import asyncio + # asyncio.run(amazon_example()) + asyncio.run(google_example()) \ No newline at end of file