feat(crawl): implement direct crawl functionality and introduce CacheMode for improved caching control

2024-11-17 17:17:34 +08:00
parent 3a524a3bdd
commit 2a82455b3d
4 changed files with 153 additions and 6 deletions
--- a/docs/examples/docker_example.py
+++ b/docs/examples/docker_example.py
@@ -9,7 +9,7 @@ from typing import Dict, Any
 class Crawl4AiTester:
    def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
        self.base_url = base_url
-        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN')  # Check environment variable as fallback
+        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"  # Fall back to the environment variable, then to the default test token
        self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
        
    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
@@ -44,6 +44,16 @@ class Crawl4AiTester:
            raise TimeoutError("Task did not complete within server timeout")
        response.raise_for_status()
        return response.json()
+    
+    def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Directly crawl without using task queue"""
+        response = requests.post(
+            f"{self.base_url}/crawl_direct", 
+            json=request_data, 
+            headers=self.headers
+        )
+        response.raise_for_status()
+        return response.json()

 def test_docker_deployment(version="basic"):
    tester = Crawl4AiTester(
@@ -68,9 +78,10 @@ def test_docker_deployment(version="basic"):
            time.sleep(5)
    
    # Test cases based on version
-    test_basic_crawl(tester)
-    test_basic_crawl(tester)
-    test_basic_crawl_sync(tester)
+    # test_basic_crawl(tester)
+    # test_basic_crawl(tester)
+    # test_basic_crawl_sync(tester)
+    test_basic_crawl_direct(tester)
    
    # if version in ["full", "transformer"]:
    #     test_cosine_extraction(tester)
@@ -110,6 +121,20 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
    assert result['result']['success']
    assert len(result['result']['markdown']) > 0
    
+def test_basic_crawl_direct(tester: Crawl4AiTester):
+    print("\n=== Testing Basic Crawl (Direct) ===")
+    request = {
+        "urls": "https://www.nbcnews.com/business",
+        "priority": 10,
+        # "session_id": "test"
+        "cache_mode": "bypass"  # or "enabled", "disabled", "read_only", "write_only"
+    }
+    
+    result = tester.crawl_direct(request)
+    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
+    assert result['result']['success']
+    assert len(result['result']['markdown']) > 0
+    
 def test_js_execution(tester: Crawl4AiTester):
    print("\n=== Testing JS Execution ===")
    request = {
--- a/docs/md_v2/basic/cache-modes.md
+++ b/docs/md_v2/basic/cache-modes.md
@@ -0,0 +1,79 @@
+# Crawl4AI Cache System and Migration Guide
+
+## Overview
+Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
+
+## Old vs New Approach
+
+### Old Way (Deprecated)
+The old system used multiple boolean flags:
+- `bypass_cache`: Skip cache entirely
+- `disable_cache`: Disable all caching
+- `no_cache_read`: Don't read from cache
+- `no_cache_write`: Don't write to cache
+
+### New Way (Recommended)
+The new system uses a single `CacheMode` enum:
+- `CacheMode.ENABLED`: Normal caching (read/write)
+- `CacheMode.DISABLED`: No caching at all
+- `CacheMode.READ_ONLY`: Read from the cache but never write to it
+- `CacheMode.WRITE_ONLY`: Write to the cache but never read from it
+- `CacheMode.BYPASS`: Skip cache for this operation
+
+## Migration Example
+
+### Old Code (Deprecated)
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def use_proxy():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            bypass_cache=True  # Old way
+        )
+        print(len(result.markdown))
+
+async def main():
+    await use_proxy()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### New Code (Recommended)
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode  # Import CacheMode
+
+async def use_proxy():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            cache_mode=CacheMode.BYPASS  # New way
+        )
+        print(len(result.markdown))
+
+async def main():
+    await use_proxy()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Common Migration Patterns
+
+Old Flag | New Mode
+---------|----------
+`bypass_cache=True` | `cache_mode=CacheMode.BYPASS`
+`disable_cache=True` | `cache_mode=CacheMode.DISABLED`
+`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY`
+`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY`
+
+## Suppressing Deprecation Warnings
+If you need time to migrate, you can temporarily suppress deprecation warnings:
+```python
+# In your config.py
+SHOW_DEPRECATION_WARNINGS = False
+```
--- a/main.py
+++ b/main.py
@@ -25,7 +25,7 @@ import logging
 from enum import Enum
 from dataclasses import dataclass
 import json
-from crawl4ai import AsyncWebCrawler, CrawlResult
+from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode
 from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    CosineStrategy,
@@ -66,6 +66,7 @@ class CrawlRequest(BaseModel):
    magic: bool = False
    extra: Optional[Dict[str, Any]] = {}
    session_id: Optional[str] = None
+    cache_mode: Optional[CacheMode] = None

@dataclass
 class TaskInfo:
@@ -329,7 +330,7 @@ app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages"

 # API token security
 security = HTTPBearer()
-CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")
+CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"

 async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
    if not CRAWL4AI_API_TOKEN:
@@ -419,6 +420,47 @@ async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
    # If we get here, task didn't complete within timeout
    raise HTTPException(status_code=408, detail="Task timed out")

+@app.post("/crawl_direct", dependencies=[Depends(verify_token)])
+async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
+    try:
+        crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
+        extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
+        
+        try:
+            if isinstance(request.urls, list):
+                results = await crawler.arun_many(
+                    urls=[str(url) for url in request.urls],
+                    extraction_strategy=extraction_strategy,
+                    js_code=request.js_code,
+                    wait_for=request.wait_for,
+                    css_selector=request.css_selector,
+                    screenshot=request.screenshot,
+                    magic=request.magic,
+                    cache_mode=request.cache_mode,
+                    session_id=request.session_id,
+                    **request.extra,
+                )
+                return {"results": [result.dict() for result in results]}
+            else:
+                result = await crawler.arun(
+                    url=str(request.urls),
+                    extraction_strategy=extraction_strategy,
+                    js_code=request.js_code,
+                    wait_for=request.wait_for,
+                    css_selector=request.css_selector,
+                    screenshot=request.screenshot,
+                    magic=request.magic,
+                    cache_mode=request.cache_mode,
+                    session_id=request.session_id,
+                    **request.extra,
+                )
+                return {"result": result.dict()}
+        finally:
+            await crawler_service.crawler_pool.release(crawler)
+    except Exception as e:
+        logger.error(f"Error in direct crawl: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+    
@app.get("/health")
 async def health_check():
    available_slots = await crawler_service.resource_monitor.get_available_slots()
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -17,6 +17,7 @@ nav:
    - 'Browser Configuration': 'basic/browser-config.md'
    - 'Page Interaction': 'basic/page-interaction.md'
    - 'Content Selection': 'basic/content-selection.md'
+    - 'Cache Modes': 'basic/cache-modes.md'

  - Advanced:
    - 'Content Processing': 'advanced/content-processing.md'