feat(crawl): implement direct crawl functionality and introduce CacheMode for improved caching control

Author: UncleCode
Date: 2024-11-17 17:17:34 +08:00
parent 3a524a3bdd
commit 2a82455b3d

4 changed files with 153 additions and 6 deletions


@@ -9,7 +9,7 @@ from typing import Dict, Any
 class Crawl4AiTester:
     def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
         self.base_url = base_url
-        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN')  # Check environment variable as fallback
+        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"  # Check environment variable as fallback
         self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
 
     def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
@@ -45,6 +45,16 @@ class Crawl4AiTester:
         response.raise_for_status()
         return response.json()
 
+    def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Directly crawl without using task queue"""
+        response = requests.post(
+            f"{self.base_url}/crawl_direct",
+            json=request_data,
+            headers=self.headers
+        )
+        response.raise_for_status()
+        return response.json()
+
 def test_docker_deployment(version="basic"):
     tester = Crawl4AiTester(
         base_url="http://localhost:11235",
@@ -68,9 +78,10 @@ def test_docker_deployment(version="basic"):
     time.sleep(5)
 
     # Test cases based on version
-    test_basic_crawl(tester)
-    test_basic_crawl(tester)
-    test_basic_crawl_sync(tester)
+    # test_basic_crawl(tester)
+    # test_basic_crawl(tester)
+    # test_basic_crawl_sync(tester)
+    test_basic_crawl_direct(tester)
 
     # if version in ["full", "transformer"]:
     #     test_cosine_extraction(tester)
@@ -110,6 +121,20 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
     assert result['result']['success']
     assert len(result['result']['markdown']) > 0
 
+def test_basic_crawl_direct(tester: Crawl4AiTester):
+    print("\n=== Testing Basic Crawl (Direct) ===")
+    request = {
+        "urls": "https://www.nbcnews.com/business",
+        "priority": 10,
+        # "session_id": "test"
+        "cache_mode": "bypass"  # or "enabled", "disabled", "read_only", "write_only"
+    }
+
+    result = tester.crawl_direct(request)
+    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
+    assert result['result']['success']
+    assert len(result['result']['markdown']) > 0
+
 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
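Taken together, the new tester method and endpoint can be exercised without the task queue at all. A minimal standalone sketch, using only the endpoint path, payload shape, and `test_api_code` fallback visible in this diff:

```python
# Standalone direct-crawl call (sketch; endpoint and payload taken from the diff above)
import os
import requests

token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"  # same fallback as the server
response = requests.post(
    "http://localhost:11235/crawl_direct",
    json={
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
        "cache_mode": "bypass",
    },
    headers={"Authorization": f"Bearer {token}"},
)
response.raise_for_status()
print(len(response.json()["result"]["markdown"]))
```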


@@ -0,0 +1,79 @@
# Crawl4AI Cache System and Migration Guide
## Overview
Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
## Old vs New Approach
### Old Way (Deprecated)
The old system used multiple boolean flags:
- `bypass_cache`: Skip cache entirely
- `disable_cache`: Disable all caching
- `no_cache_read`: Don't read from cache
- `no_cache_write`: Don't write to cache
### New Way (Recommended)
The new system uses a single `CacheMode` enum:
- `CacheMode.ENABLED`: Normal caching (read/write)
- `CacheMode.DISABLED`: No caching at all
- `CacheMode.READ_ONLY`: Only read from cache
- `CacheMode.WRITE_ONLY`: Only write to cache
- `CacheMode.BYPASS`: Skip cache for this operation
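For orientation, a minimal sketch of what such an enum could look like; the actual definition shipped in `crawl4ai` may differ:

```python
from enum import Enum

class CacheMode(Enum):
    """Hypothetical sketch of the CacheMode enum described above."""
    ENABLED = "enabled"        # normal caching: read and write
    DISABLED = "disabled"      # no caching at all
    READ_ONLY = "read_only"    # serve from cache, never write
    WRITE_ONLY = "write_only"  # write to cache, never read
    BYPASS = "bypass"          # skip the cache for this operation
```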
## Migration Example
### Old Code (Deprecated)
```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def crawl_without_cache():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True  # Old way
        )
        print(len(result.markdown))

async def main():
    await crawl_without_cache()

if __name__ == "__main__":
    asyncio.run(main())
```
### New Code (Recommended)
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode  # Import CacheMode

async def crawl_without_cache():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            cache_mode=CacheMode.BYPASS  # New way
        )
        print(len(result.markdown))

async def main():
    await crawl_without_cache()

if __name__ == "__main__":
    asyncio.run(main())
```
## Common Migration Patterns
Old Flag | New Mode
---------|----------
`bypass_cache=True` | `cache_mode=CacheMode.BYPASS`
`disable_cache=True` | `cache_mode=CacheMode.DISABLED`
`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY`
`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY`
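For codebases with many call sites, a small translation helper (hypothetical, not part of the library) can centralize this mapping during migration:

```python
from crawl4ai import CacheMode

def legacy_flags_to_cache_mode(
    bypass_cache: bool = False,
    disable_cache: bool = False,
    no_cache_read: bool = False,
    no_cache_write: bool = False,
) -> CacheMode:
    """Translate the deprecated boolean flags into the equivalent CacheMode."""
    if disable_cache:
        return CacheMode.DISABLED
    if bypass_cache:
        return CacheMode.BYPASS
    if no_cache_read:
        return CacheMode.WRITE_ONLY
    if no_cache_write:
        return CacheMode.READ_ONLY
    return CacheMode.ENABLED
```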
## Suppressing Deprecation Warnings
If you need time to migrate, you can temporarily suppress deprecation warnings:
```python
# In your config.py
SHOW_DEPRECATION_WARNINGS = False
```
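If the library emits standard Python `DeprecationWarning`s for the old flags (an assumption about the implementation), they can also be silenced per process without touching config.py:

```python
import warnings

# Assumes crawl4ai emits ordinary DeprecationWarning for the legacy flags
warnings.filterwarnings("ignore", category=DeprecationWarning, module="crawl4ai.*")
```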

main.py

@@ -25,7 +25,7 @@ import logging
 from enum import Enum
 from dataclasses import dataclass
 import json
-from crawl4ai import AsyncWebCrawler, CrawlResult
+from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode
 from crawl4ai.extraction_strategy import (
     LLMExtractionStrategy,
     CosineStrategy,
@@ -66,6 +66,7 @@ class CrawlRequest(BaseModel):
     magic: bool = False
     extra: Optional[Dict[str, Any]] = {}
     session_id: Optional[str] = None
+    cache_mode: Optional[CacheMode] = None
 
 @dataclass
 class TaskInfo:
@@ -329,7 +330,7 @@ app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages"
 # API token security
 security = HTTPBearer()
-CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")
+CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
 
 async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
     if not CRAWL4AI_API_TOKEN:
@@ -419,6 +420,47 @@ async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
     # If we get here, task didn't complete within timeout
     raise HTTPException(status_code=408, detail="Task timed out")
 
+@app.post("/crawl_direct", dependencies=[Depends(verify_token)])
+async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
+    try:
+        crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
+        extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
+
+        try:
+            if isinstance(request.urls, list):
+                results = await crawler.arun_many(
+                    urls=[str(url) for url in request.urls],
+                    extraction_strategy=extraction_strategy,
+                    js_code=request.js_code,
+                    wait_for=request.wait_for,
+                    css_selector=request.css_selector,
+                    screenshot=request.screenshot,
+                    magic=request.magic,
+                    cache_mode=request.cache_mode,
+                    session_id=request.session_id,
+                    **request.extra,
+                )
+                return {"results": [result.dict() for result in results]}
+            else:
+                result = await crawler.arun(
+                    url=str(request.urls),
+                    extraction_strategy=extraction_strategy,
+                    js_code=request.js_code,
+                    wait_for=request.wait_for,
+                    css_selector=request.css_selector,
+                    screenshot=request.screenshot,
+                    magic=request.magic,
+                    cache_mode=request.cache_mode,
+                    session_id=request.session_id,
+                    **request.extra,
+                )
+                return {"result": result.dict()}
+        finally:
+            await crawler_service.crawler_pool.release(crawler)
+    except Exception as e:
+        logger.error(f"Error in direct crawl: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
 @app.get("/health")
 async def health_check():
     available_slots = await crawler_service.resource_monitor.get_available_slots()
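Note the two response shapes above: a single URL string yields `{"result": {...}}` while a list of URLs routes through `arun_many` and yields `{"results": [...]}`. A sketch of the multi-URL case against a locally running server (token value is the fallback introduced in this commit):

```python
# Multi-URL direct crawl (sketch): "urls" as a list returns {"results": [...]}
import requests

response = requests.post(
    "http://localhost:11235/crawl_direct",
    json={
        "urls": ["https://www.nbcnews.com/business", "https://example.com"],
        "cache_mode": "bypass",
    },
    headers={"Authorization": "Bearer test_api_code"},
)
response.raise_for_status()
for item in response.json()["results"]:
    print(item["success"], len(item.get("markdown") or ""))
```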


@@ -17,6 +17,7 @@ nav:
     - 'Browser Configuration': 'basic/browser-config.md'
     - 'Page Interaction': 'basic/page-interaction.md'
     - 'Content Selection': 'basic/content-selection.md'
+    - 'Cache Modes': 'basic/cache-modes.md'
   - Advanced:
     - 'Content Processing': 'advanced/content-processing.md'