refactor(core): reorganize project structure and remove legacy code
Major reorganization of the project structure:

- Moved legacy synchronous crawler code to the legacy folder
- Removed deprecated CLI and docs manager
- Consolidated version manager into utils.py
- Added CrawlerHub to __init__.py exports
- Fixed type hints in async_webcrawler.py
- Fixed minor bugs in chunking and crawler strategies

BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
This commit is contained in:
@@ -14,6 +14,8 @@ from .extraction_strategy import (
|
|||||||
JsonCssExtractionStrategy,
|
JsonCssExtractionStrategy,
|
||||||
JsonXPathExtractionStrategy
|
JsonXPathExtractionStrategy
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
|
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
|
||||||
@@ -26,10 +28,12 @@ from .async_dispatcher import (
|
|||||||
DisplayMode,
|
DisplayMode,
|
||||||
BaseDispatcher
|
BaseDispatcher
|
||||||
)
|
)
|
||||||
|
from .hub import CrawlerHub
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
"CrawlResult",
|
"CrawlResult",
|
||||||
|
"CrawlerHub",
|
||||||
"CacheMode",
|
"CacheMode",
|
||||||
"ContentScrapingStrategy",
|
"ContentScrapingStrategy",
|
||||||
"WebScrapingStrategy",
|
"WebScrapingStrategy",
|
||||||
|
|||||||
@@ -1265,6 +1265,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"""
|
"""
|
||||||
config.url = url
|
config.url = url
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
|
execution_result = None
|
||||||
status_code = None
|
status_code = None
|
||||||
redirected_url = url
|
redirected_url = url
|
||||||
|
|
||||||
@@ -1522,6 +1523,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
execution_result = await self.robust_execute_user_script(
|
execution_result = await self.robust_execute_user_script(
|
||||||
page, config.js_code
|
page, config.js_code
|
||||||
)
|
)
|
||||||
|
|
||||||
if not execution_result["success"]:
|
if not execution_result["success"]:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
message="User script execution had issues: {error}",
|
message="User script execution had issues: {error}",
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import json # Added for serialization/deserialization
|
|||||||
from .utils import ensure_content_dirs, generate_content_hash
|
from .utils import ensure_content_dirs, generate_content_hash
|
||||||
from .models import CrawlResult, MarkdownGenerationResult
|
from .models import CrawlResult, MarkdownGenerationResult
|
||||||
import aiofiles
|
import aiofiles
|
||||||
from .version_manager import VersionManager
|
from .utils import VersionManager
|
||||||
from .async_logger import AsyncLogger
|
from .async_logger import AsyncLogger
|
||||||
from .utils import get_error_context, create_box_message
|
from .utils import get_error_context, create_box_message
|
||||||
|
|
||||||
|
|||||||
@@ -49,6 +49,12 @@ from collections.abc import AsyncGenerator
|
|||||||
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
||||||
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||||
|
|
||||||
|
DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||||
|
DeepCrawlManyReturn = Union[
|
||||||
|
List[List[CrawlResultT]],
|
||||||
|
AsyncGenerator[CrawlResultT, None],
|
||||||
|
]
|
||||||
|
|
||||||
from .__version__ import __version__ as crawl4ai_version
|
from .__version__ import __version__ as crawl4ai_version
|
||||||
|
|
||||||
|
|
||||||
@@ -282,7 +288,7 @@ class AsyncWebCrawler:
|
|||||||
user_agent: str = None,
|
user_agent: str = None,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> CrawlResult:
|
) -> Union[CrawlResult, DeepCrawlSingleReturn]:
|
||||||
"""
|
"""
|
||||||
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
||||||
|
|
||||||
@@ -709,7 +715,7 @@ class AsyncWebCrawler:
|
|||||||
user_agent: str = None,
|
user_agent: str = None,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
**kwargs
|
**kwargs
|
||||||
) -> RunManyReturn:
|
) -> Union[RunManyReturn, DeepCrawlManyReturn]:
|
||||||
"""
|
"""
|
||||||
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ from collections import Counter
|
|||||||
import string
|
import string
|
||||||
from .model_loader import load_nltk_punkt
|
from .model_loader import load_nltk_punkt
|
||||||
|
|
||||||
|
|
||||||
# Define the abstract base class for chunking strategies
|
# Define the abstract base class for chunking strategies
|
||||||
class ChunkingStrategy(ABC):
|
class ChunkingStrategy(ABC):
|
||||||
"""
|
"""
|
||||||
@@ -72,6 +71,7 @@ class NlpSentenceChunking(ChunkingStrategy):
|
|||||||
"""
|
"""
|
||||||
Initialize the NlpSentenceChunking object.
|
Initialize the NlpSentenceChunking object.
|
||||||
"""
|
"""
|
||||||
|
# Relative import of the sibling module, matching the module-level import of
# the same symbol; the previous dotted path `crawl4ai.le.legacy` looks like a typo.
from .model_loader import load_nltk_punkt
|
||||||
load_nltk_punkt()
|
load_nltk_punkt()
|
||||||
|
|
||||||
def chunk(self, text: str) -> list:
|
def chunk(self, text: str) -> list:
|
||||||
|
|||||||
0
crawl4ai/crawlers/__init__.py
Normal file
0
crawl4ai/crawlers/__init__.py
Normal file
0
crawl4ai/crawlers/amazon_product/__init__.py
Normal file
0
crawl4ai/crawlers/amazon_product/__init__.py
Normal file
20
crawl4ai/crawlers/amazon_product/crawler.py
Normal file
20
crawl4ai/crawlers/amazon_product/crawler.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
import json

from crawl4ai.hub import BaseCrawler

# Module-level crawler metadata.
__meta__ = {
    "version": "1.2.0",
    "tested_on": ["amazon.com"],
    "rate_limit": "50 RPM",
    "schema": {"product": ["name", "price"]}
}


class AmazonProductCrawler(BaseCrawler):
    """Placeholder Amazon product crawler that returns a canned JSON payload."""

    async def run(self, url: str, **kwargs) -> str:
        """Log the requested URL and return a fixed product JSON string.

        Args:
            url: Address of the product page (only logged by this stub).
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A JSON string. On failure, a JSON object with ``error`` and
            ``metadata`` keys instead of raising.
        """
        try:
            self.logger.info(f"Crawling {url}")
            return '{"product": {"name": "Test Amazon Product"}}'
        except Exception as e:
            self.logger.error(f"Crawl failed: {str(e)}")
            return json.dumps({
                "error": str(e),
                # ``meta`` may not be set on a directly-instantiated crawler,
                # so fall back to the module metadata rather than raising
                # inside the error handler.
                "metadata": getattr(self, "meta", __meta__),
            })
|
||||||
0
crawl4ai/crawlers/google_search/__init__.py
Normal file
0
crawl4ai/crawlers/google_search/__init__.py
Normal file
125
crawl4ai/crawlers/google_search/crawler.py
Normal file
125
crawl4ai/crawlers/google_search/crawler.py
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.hub import BaseCrawler
from crawl4ai.utils import optimize_html, get_home_folder
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from pathlib import Path
from urllib.parse import quote_plus
import json
import os
import asyncio
from typing import Dict, Any

# Module-level metadata. It is also exposed on the class (below) so existing
# code that reads ``GoogleSearchCrawler.__meta__`` keeps working.
__meta__ = {
    "version": "1.0.0",
    "tested_on": ["google.com/search*"],
    "rate_limit": "10 RPM",
    "description": "Crawls Google Search results (text + images)",
}

# (target_json_example, query) pairs handed to the schema generator, keyed by
# the schema name used both in the returned dict and in the cache file name.
_SCHEMA_SPECS = {
    "organic_schema": (
        """{
    "title": "...",
    "link": "...",
    "snippet": "...",
    "date": "1 hour ago",
}""",
        """The given html is the crawled html from Google search result. Please find the schema for organic search item in the given html, I am interested in title, link, snippet text. date.""",
    ),
    "top_stories_schema": (
        """{
    "title": "...",
    "link": "...",
    "source": "Insider Monkey",
    "date": "1 hour ago",
    "imageUrl": "..."
}""",
        """The given html is the crawled html from Google search result. Please find the schema for Top Story item int he given html, I am interested in title, link, source. date and imageUrl.""",
    ),
    "suggested_query_schema": (
        """{
    "query": "A for Apple",
}""",
        """The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "People also search for" within the given HTML. I am interested in the queries only.""",
    ),
}


class GoogleSearchCrawler(BaseCrawler):
    """Crawler for Google Search result pages (text results and images)."""

    __meta__ = __meta__

    def __init__(self):
        super().__init__()
        # JavaScript injected for image searches; lives next to this module.
        self.js_script = (Path(__file__).parent /
                          "script.js").read_text()

    async def run(self, url="", query: str = "", search_type: str = "text", schema_cache_path = None, **kwargs) -> str:
        """Crawl Google Search results for a query.

        Args:
            url: Ignored; the search URL is always built from ``query``.
            query: Search terms. URL-encoded before being placed in the URL.
            search_type: ``"text"`` for regular results; ``"image"`` runs the
                bundled script and returns its output. Any other value uses
                the image URL but is then processed like a text search.
            schema_cache_path: Directory under which ``schema/*.json`` files
                are cached; defaults to ``get_home_folder()``.
            **kwargs: Optional ``cache_mode`` and ``delay`` overrides.

        Returns:
            A JSON string with the extracted data, or a JSON object with an
            ``error`` key when the crawl or the image script fails.
        """
        # Encode the query so characters such as '&', '#' or '+' cannot
        # corrupt the query string.
        encoded_query = quote_plus(query)
        if search_type == "text":
            url = f"https://www.google.com/search?q={encoded_query}&gl=sg&hl=en"
        else:
            url = f"https://www.google.com/search?q={encoded_query}&gl=sg&hl=en&tbs=qdr:d&udm=2"

        browser_config = BrowserConfig(headless=True, verbose=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            config = CrawlerRunConfig(
                cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
                delay_before_return_html=kwargs.get(
                    "delay", 2 if search_type == "image" else 1),
                js_code=self.js_script if search_type == "image" else None,
            )

            result = await crawler.arun(url=url, config=config)
            if not result.success:
                # NOTE(review): CrawlResult is not visible from this module;
                # the message is read defensively from either attribute name
                # -- confirm which one the model actually defines.
                error = getattr(result, "error_message", None) or getattr(result, "error", None)
                return json.dumps({"error": error})

            if search_type == "image":
                return self._format_image_results(result.js_execution_result)

            # For text search, extract structured data
            schemas = await self._build_schemas(result.cleaned_html, schema_cache_path)
            extracted = {
                key: JsonCssExtractionStrategy(schema=schema).run(
                    url=url, sections=[result.html]
                )
                for key, schema in schemas.items()
            }
            return json.dumps(extracted, indent=4)

    @staticmethod
    def _format_image_results(js_result) -> str:
        """Turn the injected script's execution result into a JSON string."""
        if not js_result or js_result.get("success", False) is False:
            return json.dumps({"error": (js_result or {}).get("error", "Unknown error")})

        results = js_result.get("results")
        if not results:
            # Previously this case fell through into text-schema extraction.
            return json.dumps({"error": "Image extraction script returned no results"})

        image_result = results[0]
        if image_result.get("success", False) is False:
            return json.dumps({"error": image_result.get("error", "Unknown error")})
        return json.dumps(image_result.get("result", []), indent=4)

    async def _build_schemas(self, html: str, schema_cache_path: str = None) -> Dict[str, Dict]:
        """Build extraction schemas (organic, top stories, etc.)

        Each schema is read from ``<cache dir>/schema/<name>.json`` when that
        file exists; otherwise it is generated from the page HTML and written
        to that file.
        """
        home_dir = schema_cache_path or get_home_folder()
        schema_dir = Path(home_dir) / "schema"
        schema_dir.mkdir(parents=True, exist_ok=True)

        cleaned_html = optimize_html(html, threshold=100)

        return {
            name: self._load_or_generate_schema(
                schema_dir / f"{name}.json", cleaned_html, example, query
            )
            for name, (example, query) in _SCHEMA_SPECS.items()
        }

    @staticmethod
    def _load_or_generate_schema(cache_file: Path, html: str, target_json_example: str, query: str) -> Dict:
        """Return the cached schema at ``cache_file`` or generate and cache it."""
        if cache_file.exists():
            with cache_file.open("r", encoding="utf-8") as f:
                return json.load(f)

        schema = JsonCssExtractionStrategy.generate_schema(
            html=html,
            target_json_example=target_json_example,
            query=query,
        )
        with cache_file.open("w", encoding="utf-8") as f:
            f.write(json.dumps(schema))
        return schema
|
||||||
115
crawl4ai/crawlers/google_search/script.js
Normal file
115
crawl4ai/crawlers/google_search/script.js
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
(() => {
    // Collect one record per image entry found in the page's `window.W_jd`
    // object. Returns an array (empty when nothing usable is found).
    function extractImageData() {
        // The global may be absent; bail out instead of throwing from
        // Object.keys(undefined).
        const store = window.W_jd;
        if (!store || typeof store !== 'object') return [];

        const keys = Object.keys(store);
        const allImageData = [];
        let currentPosition = 0;

        // Get the symbol we'll use (from first valid entry)
        let targetSymbol;
        for (const key of keys) {
            try {
                const symbols = Object.getOwnPropertySymbols(store[key]);
                if (symbols.length > 0) {
                    targetSymbol = symbols[0];
                    break;
                }
            } catch (e) {
                continue;
            }
        }

        if (!targetSymbol) return [];

        // Iterate through ALL keys
        for (const key of keys) {
            try {
                const holder = store[key][targetSymbol];
                if (!holder) continue;
                const data = Object.values(holder)[0];
                // Check if this is a valid image data entry
                if (data && Array.isArray(data[1])) {
                    const processedData = processImageEntry(data, currentPosition);
                    if (processedData) {
                        allImageData.push(processedData);
                        currentPosition++;
                    }
                }
            } catch (e) {
                // A malformed entry is skipped; the rest are still collected.
                continue;
            }
        }

        return allImageData;
    }

    // Convert one raw entry into a result record, or null when required
    // pieces (id, DOM element, thumbnail/image info, link) are missing.
    function processImageEntry(entry, position) {
        const imageData = entry[1];
        if (!Array.isArray(imageData)) return null;

        // Extract the image ID
        const imageId = imageData[1];
        if (!imageId) return null;

        // Find the corresponding DOM element
        const domElement = document.querySelector(`[data-docid="${imageId}"]`);
        if (!domElement) return null;

        // Positional fields of the entry array that are actually used.
        const thumbnailInfo = imageData[2];
        const imageInfo = imageData[3];
        const metadata = imageData[9];

        // Ensure we have the required data
        if (!thumbnailInfo || !imageInfo) return null;

        // Extract metadata from DOM
        const title = domElement.querySelector('.toI8Rb')?.textContent?.trim();
        const source = domElement.querySelector('.guK3rf')?.textContent?.trim();
        const link = domElement.querySelector('a.EZAeBe')?.href;

        if (!link) return null;

        // Build Google Image URL
        const googleUrl = buildGoogleImageUrl(imageInfo[0], link, imageId, imageInfo[1], imageInfo[2]);

        return {
            title,
            imageUrl: imageInfo[0],
            imageWidth: imageInfo[2],
            imageHeight: imageInfo[1],
            thumbnailUrl: thumbnailInfo[0],
            thumbnailWidth: thumbnailInfo[2],
            thumbnailHeight: thumbnailInfo[1],
            source,
            // `metadata` can be undefined; optional chaining lets the
            // hostname fallback apply instead of throwing.
            domain: metadata?.['2000']?.[1] || new URL(link).hostname,
            link,
            googleUrl,
            position: position + 1
        };
    }

    // Assemble the google.com/imgres URL for an image.
    function buildGoogleImageUrl(imgUrl, refUrl, tbnid, height, width) {
        const params = new URLSearchParams({
            imgurl: imgUrl,
            tbnid: tbnid,
            imgrefurl: refUrl,
            docid: tbnid,
            w: width.toString(),
            h: height.toString(),
        });

        return `https://www.google.com/imgres?${params.toString()}`;
    }

    return extractImageData();
})();
|
||||||
73
crawl4ai/hub.py
Normal file
73
crawl4ai/hub.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
import importlib
|
||||||
|
import pkgutil
|
||||||
|
from pathlib import Path
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Type
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# crawl4ai/base.py
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
|
class BaseCrawler(ABC):
    """Abstract base class for crawlers.

    Subclasses implement an async ``run`` method that returns a JSON string.
    The ``run`` contract is validated when the subclass is defined.
    """

    def __init__(self):
        # One logger per concrete crawler class, named after the class.
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    async def run(self, url: str = "", **kwargs) -> str:
        """
        Implement this method to return JSON string.
        Must accept URL + arbitrary kwargs for flexibility.
        """
        pass

    def __init_subclass__(cls, **kwargs):
        """Enforce interface validation on subclassing.

        Raises:
            TypeError: If ``run`` cannot take a positional ``url`` argument
                after ``self``, or if it is not a coroutine function.
        """
        super().__init_subclass__(**kwargs)

        run_method = cls.run

        # Verify run method signature: it must be callable as run(self, url).
        # bind_partial tolerates further required parameters, and also accepts
        # ``*args`` signatures, which a raw positional-argument count rejects.
        try:
            inspect.signature(run_method).bind_partial(None, "")
        except (TypeError, ValueError):
            raise TypeError(
                f"{cls.__name__} must implement 'run(self, url: str, **kwargs)'"
            ) from None

        # Verify async nature
        if not inspect.iscoroutinefunction(run_method):
            raise TypeError(f"{cls.__name__}.run must be async")
|
||||||
|
|
||||||
|
class CrawlerHub:
    """Registry of crawler classes discovered under the ``crawlers`` package.

    Crawlers are keyed by the name of the directory that contains their
    ``crawler`` module.
    """

    # Maps crawler directory name -> registered crawler class.
    _crawlers: Dict[str, Type[BaseCrawler]] = {}

    @classmethod
    def _discover_crawlers(cls):
        """Import ``crawl4ai.crawlers.<dir>.crawler`` for each sub-directory
        and register the crawler classes found there.

        Import failures are logged as warnings and do not stop discovery.
        """
        base_path = Path(__file__).parent / "crawlers"
        if not base_path.is_dir():
            logger.warning(f"Crawlers directory not found: {base_path}")
            return

        for crawler_dir in sorted(base_path.iterdir()):
            name = crawler_dir.name
            # Skip entries that can never be crawler packages (cache and
            # hidden directories) so they do not produce a warning each time.
            if not crawler_dir.is_dir() or name == "__pycache__" or not name.isidentifier():
                continue
            try:
                module = importlib.import_module(
                    f"crawl4ai.crawlers.{name}.crawler"
                )
                for attr in dir(module):
                    cls._maybe_register_crawler(
                        getattr(module, attr), name
                    )
            except Exception as e:
                logger.warning(f"Failed {name}: {str(e)}")

    @classmethod
    def _maybe_register_crawler(cls, obj, name: str):
        """Register ``obj`` under ``name`` if it is a BaseCrawler subclass.

        The crawler's metadata is attached as ``obj.meta``: the defining
        module's ``__meta__`` if present, otherwise a ``__meta__`` attribute
        on the class itself, otherwise an empty dict.
        """
        if isinstance(obj, type) and issubclass(obj, BaseCrawler) and obj is not BaseCrawler:
            module = importlib.import_module(obj.__module__)
            meta = getattr(module, "__meta__", None)
            if meta is None:
                meta = getattr(obj, "__meta__", {})
            obj.meta = meta
            cls._crawlers[name] = obj

    @classmethod
    def get(cls, name: str) -> Optional[Type[BaseCrawler]]:
        """Return the crawler class registered as ``name``, or ``None``.

        Discovery runs on demand whenever the registry is empty.
        """
        if not cls._crawlers:
            cls._discover_crawlers()
        return cls._crawlers.get(name)
|
||||||
0
crawl4ai/legacy/__init__.py
Normal file
0
crawl4ai/legacy/__init__.py
Normal file
@@ -28,6 +28,35 @@ import hashlib
|
|||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
from urllib.robotparser import RobotFileParser
|
from urllib.robotparser import RobotFileParser
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
from pathlib import Path
|
||||||
|
from packaging import version
|
||||||
|
from . import __version__
|
||||||
|
|
||||||
|
|
||||||
|
class VersionManager:
    """Tracks the library version recorded in ``~/.crawl4ai/version.txt``."""

    def __init__(self):
        self.home_dir = Path.home() / ".crawl4ai"
        self.version_file = self.home_dir / "version.txt"

    def get_installed_version(self):
        """Get the version recorded in home directory.

        Returns:
            The parsed version, or ``None`` when the file is missing or
            cannot be read or parsed.
        """
        if not self.version_file.exists():
            return None
        try:
            return version.parse(self.version_file.read_text().strip())
        except Exception:
            # Best effort: an unreadable or malformed file counts as "no
            # recorded version". Unlike a bare ``except``, this lets
            # KeyboardInterrupt/SystemExit propagate.
            return None

    def update_version(self):
        """Update the version file to current library version"""
        # Create the directory first so a fresh install does not fail on write.
        self.home_dir.mkdir(parents=True, exist_ok=True)
        self.version_file.write_text(__version__.__version__)

    def needs_update(self):
        """Check if database needs update based on version.

        Returns:
            ``True`` when no version is recorded or the recorded version is
            older than the current library version.
        """
        installed = self.get_installed_version()
        current = version.parse(__version__.__version__)
        return installed is None or installed < current
|
||||||
|
|
||||||
|
|
||||||
class RobotsParser:
|
class RobotsParser:
|
||||||
# Default 7 days cache TTL
|
# Default 7 days cache TTL
|
||||||
|
|||||||
17
tests/20241401/test_crawlers.py
Normal file
17
tests/20241401/test_crawlers.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
|
||||||
|
# example_usage.py
|
||||||
|
import asyncio

from crawl4ai import CrawlerHub

# NOTE(review): the previous version imported `get_crawler` from
# `crawl4ai.crawlers`, whose __init__.py is empty in this commit. The lookup
# now goes through CrawlerHub; the crawler name below is one of the crawler
# directories added alongside it -- confirm it is the intended target.
CRAWLER_NAME = "amazon_product"


async def main():
    """Look up a registered crawler by name and run it against a sample URL."""
    # Get the registered crawler
    crawler_cls = CrawlerHub.get(CRAWLER_NAME)
    if crawler_cls is None:
        print(f"Crawler not found: {CRAWLER_NAME}")
        return

    # Crawl example.com
    result = await crawler_cls().run(url="https://example.com")

    print(result)


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
30
tests/hub/test_simple.py
Normal file
30
tests/hub/test_simple.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
# test.py
|
||||||
|
from crawl4ai import CrawlerHub
|
||||||
|
import json
|
||||||
|
|
||||||
|
async def amazon_example():
    """Run the Amazon product crawler and print its metadata and output."""
    crawler_cls = CrawlerHub.get("amazon_product")
    if not crawler_cls:
        print("Crawler not found!")
        return

    crawler = crawler_cls()
    meta = crawler_cls.meta
    print(f"Crawler version: {meta['version']}")
    print(f"Rate limits: {meta.get('rate_limit', 'Unlimited')}")
    output = await crawler.run("https://amazon.com/test")
    print(output)
|
||||||
|
|
||||||
|
async def google_example():
    """Run a text search and an image search with the Google crawler."""
    from pathlib import Path

    # Get crawler dynamically
    crawler_cls = CrawlerHub.get("google_search")
    if crawler_cls is None:
        # CrawlerHub.get returns None for an unknown or failed crawler.
        print("Crawler not found!")
        return
    crawler = crawler_cls()

    # Cache schemas under the current user's home directory rather than a
    # path that only exists on one developer's machine.
    schema_cache_path = str(Path.home() / ".crawl4ai")

    # Text search
    text_results = await crawler.run(query="apple inc", search_type="text", schema_cache_path=schema_cache_path)
    print(json.loads(text_results))

    # Image search
    image_results = await crawler.run(query="apple inc", search_type="image")
    print(image_results)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import asyncio
|
||||||
|
# asyncio.run(amazon_example())
|
||||||
|
asyncio.run(google_example())
|
||||||
Reference in New Issue
Block a user