feat(crawl): implement direct crawl functionality and introduce CacheMode for improved caching control
@@ -9,7 +9,7 @@ from typing import Dict, Any
 class Crawl4AiTester:
     def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
         self.base_url = base_url
-        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN')  # Check environment variable as fallback
+        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"  # Check environment variable as fallback
         self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
 
     def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
@@ -44,6 +44,16 @@ class Crawl4AiTester:
             raise TimeoutError("Task did not complete within server timeout")
         response.raise_for_status()
         return response.json()
 
+    def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Directly crawl without using task queue"""
+        response = requests.post(
+            f"{self.base_url}/crawl_direct",
+            json=request_data,
+            headers=self.headers
+        )
+        response.raise_for_status()
+        return response.json()
+
 def test_docker_deployment(version="basic"):
     tester = Crawl4AiTester(
@@ -68,9 +78,10 @@ def test_docker_deployment(version="basic"):
     time.sleep(5)
 
     # Test cases based on version
-    test_basic_crawl(tester)
-    test_basic_crawl(tester)
-    test_basic_crawl_sync(tester)
+    # test_basic_crawl(tester)
+    # test_basic_crawl(tester)
+    # test_basic_crawl_sync(tester)
+    test_basic_crawl_direct(tester)
 
     # if version in ["full", "transformer"]:
     #     test_cosine_extraction(tester)
@@ -110,6 +121,20 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
     assert result['result']['success']
     assert len(result['result']['markdown']) > 0
 
+def test_basic_crawl_direct(tester: Crawl4AiTester):
+    print("\n=== Testing Basic Crawl (Direct) ===")
+    request = {
+        "urls": "https://www.nbcnews.com/business",
+        "priority": 10,
+        # "session_id": "test"
+        "cache_mode": "bypass"  # or "enabled", "disabled", "read_only", "write_only"
+    }
+
+    result = tester.crawl_direct(request)
+    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
+    assert result['result']['success']
+    assert len(result['result']['markdown']) > 0
+
 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
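For quick verification outside the test harness, here is a minimal sketch of calling the new direct path by hand. The endpoint name, request fields, and the `{"result": ...}` response shape come from the diffs in this commit; the local URL and `test_api_code` token are the defaults it introduces, so adjust both if your deployment differs.

```python
import requests

# Minimal sketch: hit the new /crawl_direct endpoint without the task queue.
# Assumes the dockerized server from this commit runs on localhost:11235
# and falls back to the "test_api_code" token added above.
payload = {
    "urls": "https://www.nbcnews.com/business",
    "priority": 10,
    "cache_mode": "bypass",  # or "enabled", "disabled", "read_only", "write_only"
}

response = requests.post(
    "http://localhost:11235/crawl_direct",
    json=payload,
    headers={"Authorization": "Bearer test_api_code"},
)
response.raise_for_status()
result = response.json()["result"]
print(result["success"], len(result["markdown"]))
```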
docs/md_v2/basic/cache-modes.md (new file, 79 lines)
@@ -0,0 +1,79 @@
+# Crawl4AI Cache System and Migration Guide
+
+## Overview
+Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
+
+## Old vs New Approach
+
+### Old Way (Deprecated)
+The old system used multiple boolean flags:
+- `bypass_cache`: Skip cache entirely
+- `disable_cache`: Disable all caching
+- `no_cache_read`: Don't read from cache
+- `no_cache_write`: Don't write to cache
+
+### New Way (Recommended)
+The new system uses a single `CacheMode` enum:
+- `CacheMode.ENABLED`: Normal caching (read/write)
+- `CacheMode.DISABLED`: No caching at all
+- `CacheMode.READ_ONLY`: Only read from cache
+- `CacheMode.WRITE_ONLY`: Only write to cache
+- `CacheMode.BYPASS`: Skip cache for this operation
+
+## Migration Example
+
+### Old Code (Deprecated)
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def use_proxy():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            bypass_cache=True  # Old way
+        )
+        print(len(result.markdown))
+
+async def main():
+    await use_proxy()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### New Code (Recommended)
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode  # Import CacheMode
+
+async def use_proxy():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            cache_mode=CacheMode.BYPASS  # New way
+        )
+        print(len(result.markdown))
+
+async def main():
+    await use_proxy()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Common Migration Patterns
+
+Old Flag | New Mode
+---------|----------
+`bypass_cache=True` | `cache_mode=CacheMode.BYPASS`
+`disable_cache=True` | `cache_mode=CacheMode.DISABLED`
+`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY`
+`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY`
+
+## Suppressing Deprecation Warnings
+If you need time to migrate, you can temporarily suppress deprecation warnings:
+```python
+# In your config.py
+SHOW_DEPRECATION_WARNINGS = False
+```
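As a complement to the new guide above, a small sketch (not part of the commit) of the read/write-split modes it lists, using the same `AsyncWebCrawler.arun` call shown in the migration examples; the warm-then-read pattern is an illustration of the documented mode semantics, not an API guarantee.

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode

async def warm_then_read():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # First pass: fetch the page and only write the result to cache.
        await crawler.arun(
            url="https://www.nbcnews.com/business",
            cache_mode=CacheMode.WRITE_ONLY,
        )
        # Later passes: serve the same URL from cache only, no network fetch.
        cached = await crawler.arun(
            url="https://www.nbcnews.com/business",
            cache_mode=CacheMode.READ_ONLY,
        )
        print(len(cached.markdown))

if __name__ == "__main__":
    asyncio.run(warm_then_read())
```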
main.py (46 lines changed)
@@ -25,7 +25,7 @@ import logging
 from enum import Enum
 from dataclasses import dataclass
 import json
-from crawl4ai import AsyncWebCrawler, CrawlResult
+from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode
 from crawl4ai.extraction_strategy import (
     LLMExtractionStrategy,
     CosineStrategy,
@@ -66,6 +66,7 @@ class CrawlRequest(BaseModel):
     magic: bool = False
     extra: Optional[Dict[str, Any]] = {}
     session_id: Optional[str] = None
+    cache_mode: Optional[CacheMode] = None
 
 @dataclass
 class TaskInfo:
@@ -329,7 +330,7 @@ app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages"
 
 # API token security
 security = HTTPBearer()
-CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")
+CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
 
 async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
     if not CRAWL4AI_API_TOKEN:
@@ -419,6 +420,47 @@ async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
     # If we get here, task didn't complete within timeout
     raise HTTPException(status_code=408, detail="Task timed out")
 
+@app.post("/crawl_direct", dependencies=[Depends(verify_token)])
+async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
+    try:
+        crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
+        extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
+
+        try:
+            if isinstance(request.urls, list):
+                results = await crawler.arun_many(
+                    urls=[str(url) for url in request.urls],
+                    extraction_strategy=extraction_strategy,
+                    js_code=request.js_code,
+                    wait_for=request.wait_for,
+                    css_selector=request.css_selector,
+                    screenshot=request.screenshot,
+                    magic=request.magic,
+                    cache_mode=request.cache_mode,
+                    session_id=request.session_id,
+                    **request.extra,
+                )
+                return {"results": [result.dict() for result in results]}
+            else:
+                result = await crawler.arun(
+                    url=str(request.urls),
+                    extraction_strategy=extraction_strategy,
+                    js_code=request.js_code,
+                    wait_for=request.wait_for,
+                    css_selector=request.css_selector,
+                    screenshot=request.screenshot,
+                    magic=request.magic,
+                    cache_mode=request.cache_mode,
+                    session_id=request.session_id,
+                    **request.extra,
+                )
+                return {"result": result.dict()}
+        finally:
+            await crawler_service.crawler_pool.release(crawler)
+    except Exception as e:
+        logger.error(f"Error in direct crawl: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
 @app.get("/health")
 async def health_check():
     available_slots = await crawler_service.resource_monitor.get_available_slots()
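The list branch of `crawl_direct` above returns a plural `results` payload via `arun_many`. A hedged sketch of exercising it, assuming the same local server and test token used elsewhere in this commit; the second URL is purely illustrative.

```python
import requests

# Sketch: posting a list of URLs routes through crawler.arun_many and
# returns {"results": [...]} rather than the single {"result": ...} shape.
payload = {
    "urls": [
        "https://www.nbcnews.com/business",
        "https://www.nbcnews.com/tech",  # illustrative second URL
    ],
    "cache_mode": "enabled",
}

resp = requests.post(
    "http://localhost:11235/crawl_direct",
    json=payload,
    headers={"Authorization": "Bearer test_api_code"},
)
resp.raise_for_status()
for item in resp.json()["results"]:
    print(item["success"], len(item["markdown"]))
```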
@@ -17,6 +17,7 @@ nav:
     - 'Browser Configuration': 'basic/browser-config.md'
     - 'Page Interaction': 'basic/page-interaction.md'
     - 'Content Selection': 'basic/content-selection.md'
+    - 'Cache Modes': 'basic/cache-modes.md'
 
   - Advanced:
     - 'Content Processing': 'advanced/content-processing.md'