feat(crawl): implement direct crawl functionality and introduce CacheMode for improved caching control
This commit is contained in:
@@ -9,7 +9,7 @@ from typing import Dict, Any
|
||||
class Crawl4AiTester:
|
||||
def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
|
||||
self.base_url = base_url
|
||||
self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback
|
||||
self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback
|
||||
self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
|
||||
|
||||
def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
|
||||
@@ -44,6 +44,16 @@ class Crawl4AiTester:
|
||||
raise TimeoutError("Task did not complete within server timeout")
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Directly crawl without using task queue"""
|
||||
response = requests.post(
|
||||
f"{self.base_url}/crawl_direct",
|
||||
json=request_data,
|
||||
headers=self.headers
|
||||
)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def test_docker_deployment(version="basic"):
|
||||
tester = Crawl4AiTester(
|
||||
@@ -68,9 +78,10 @@ def test_docker_deployment(version="basic"):
|
||||
time.sleep(5)
|
||||
|
||||
# Test cases based on version
|
||||
test_basic_crawl(tester)
|
||||
test_basic_crawl(tester)
|
||||
test_basic_crawl_sync(tester)
|
||||
# test_basic_crawl(tester)
|
||||
# test_basic_crawl(tester)
|
||||
# test_basic_crawl_sync(tester)
|
||||
test_basic_crawl_direct(tester)
|
||||
|
||||
# if version in ["full", "transformer"]:
|
||||
# test_cosine_extraction(tester)
|
||||
@@ -110,6 +121,20 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
|
||||
assert result['result']['success']
|
||||
assert len(result['result']['markdown']) > 0
|
||||
|
||||
def test_basic_crawl_direct(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl (Direct) ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10,
|
||||
# "session_id": "test"
|
||||
"cache_mode": "bypass" # or "enabled", "disabled", "read_only", "write_only"
|
||||
}
|
||||
|
||||
result = tester.crawl_direct(request)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
assert result['result']['success']
|
||||
assert len(result['result']['markdown']) > 0
|
||||
|
||||
def test_js_execution(tester: Crawl4AiTester):
|
||||
print("\n=== Testing JS Execution ===")
|
||||
request = {
|
||||
|
||||
79
docs/md_v2/basic/cache-modes.md
Normal file
79
docs/md_v2/basic/cache-modes.md
Normal file
@@ -0,0 +1,79 @@
|
||||
# Crawl4AI Cache System and Migration Guide
|
||||
|
||||
## Overview
|
||||
Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
|
||||
|
||||
## Old vs New Approach
|
||||
|
||||
### Old Way (Deprecated)
|
||||
The old system used multiple boolean flags:
|
||||
- `bypass_cache`: Skip cache entirely
|
||||
- `disable_cache`: Disable all caching
|
||||
- `no_cache_read`: Don't read from cache
|
||||
- `no_cache_write`: Don't write to cache
|
||||
|
||||
### New Way (Recommended)
|
||||
The new system uses a single `CacheMode` enum:
|
||||
- `CacheMode.ENABLED`: Normal caching (read/write)
|
||||
- `CacheMode.DISABLED`: No caching at all
|
||||
- `CacheMode.READ_ONLY`: Only read from cache
|
||||
- `CacheMode.WRITE_ONLY`: Only write to cache
|
||||
- `CacheMode.BYPASS`: Skip cache for this operation
|
||||
|
||||
## Migration Example
|
||||
|
||||
### Old Code (Deprecated)
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def use_proxy():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
bypass_cache=True # Old way
|
||||
)
|
||||
print(len(result.markdown))
|
||||
|
||||
async def main():
|
||||
await use_proxy()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### New Code (Recommended)
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode # Import CacheMode
|
||||
|
||||
async def use_proxy():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
cache_mode=CacheMode.BYPASS # New way
|
||||
)
|
||||
print(len(result.markdown))
|
||||
|
||||
async def main():
|
||||
await use_proxy()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Common Migration Patterns
|
||||
|
||||
Old Flag | New Mode
|
||||
---------|----------
|
||||
`bypass_cache=True` | `cache_mode=CacheMode.BYPASS`
|
||||
`disable_cache=True` | `cache_mode=CacheMode.DISABLED`
|
||||
`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY`
|
||||
`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY`
|
||||
|
||||
## Suppressing Deprecation Warnings
|
||||
If you need time to migrate, you can temporarily suppress deprecation warnings:
|
||||
```python
|
||||
# In your config.py
|
||||
SHOW_DEPRECATION_WARNINGS = False
|
||||
```
|
||||
Reference in New Issue
Block a user