feat(crawl): implement direct crawl functionality and introduce CacheMode for improved caching control

2024-11-17 17:17:34 +08:00
parent 3a524a3bdd
commit 2a82455b3d
4 changed files with 153 additions and 6 deletions
--- a/docs/examples/docker_example.py
+++ b/docs/examples/docker_example.py
@@ -9,7 +9,7 @@ from typing import Dict, Any
 class Crawl4AiTester:
    def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
        self.base_url = base_url
-        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN')  # Check environment variable as fallback
+        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"  # Check environment variable as fallback
        self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
        
    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
@@ -44,6 +44,16 @@ class Crawl4AiTester:
            raise TimeoutError("Task did not complete within server timeout")
        response.raise_for_status()
        return response.json()
+    
+    def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
+        """POST request_data to /crawl_direct (no task queue) and return the parsed JSON response; raises on an HTTP error status."""
+        response = requests.post(
+            f"{self.base_url}/crawl_direct", 
+            json=request_data, 
+            headers=self.headers
+        )
+        response.raise_for_status()
+        return response.json()

 def test_docker_deployment(version="basic"):
    tester = Crawl4AiTester(
@@ -68,9 +78,10 @@ def test_docker_deployment(version="basic"):
            time.sleep(5)
    
    # Test cases based on version
-    test_basic_crawl(tester)
-    test_basic_crawl(tester)
-    test_basic_crawl_sync(tester)
+    # test_basic_crawl(tester)
+    # test_basic_crawl(tester)
+    # test_basic_crawl_sync(tester)
+    test_basic_crawl_direct(tester)
    
    # if version in ["full", "transformer"]:
    #     test_cosine_extraction(tester)
@@ -110,6 +121,20 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
    assert result['result']['success']
    assert len(result['result']['markdown']) > 0
    
+def test_basic_crawl_direct(tester: Crawl4AiTester):
+    print("\n=== Testing Basic Crawl (Direct) ===")
+    request = {
+        "urls": "https://www.nbcnews.com/business",
+        "priority": 10,
+        # "session_id": "test",  # left disabled for this test
+        "cache_mode": "bypass"  # other modes: "enabled", "disabled", "read_only", "write_only"
+    }
+    
+    result = tester.crawl_direct(request)
+    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
+    assert result['result']['success']
+    assert len(result['result']['markdown']) > 0
+    
 def test_js_execution(tester: Crawl4AiTester):
    print("\n=== Testing JS Execution ===")
    request = {
--- a/docs/md_v2/basic/cache-modes.md
+++ b/docs/md_v2/basic/cache-modes.md
@@ -0,0 +1,79 @@
+# Crawl4AI Cache System and Migration Guide
+
+## Overview
+Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
+
+## Old vs New Approach
+
+### Old Way (Deprecated)
+The old system used multiple boolean flags:
+- `bypass_cache`: Skip cache entirely
+- `disable_cache`: Disable all caching
+- `no_cache_read`: Don't read from cache
+- `no_cache_write`: Don't write to cache
+
+### New Way (Recommended)
+The new system uses a single `CacheMode` enum:
+- `CacheMode.ENABLED`: Normal caching (read/write)
+- `CacheMode.DISABLED`: No caching at all
+- `CacheMode.READ_ONLY`: Only read from cache (never write new results to it)
+- `CacheMode.WRITE_ONLY`: Only write to cache (never read existing entries from it)
+- `CacheMode.BYPASS`: Skip cache for this operation
+
+## Migration Example
+
+### Old Code (Deprecated)
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def use_proxy():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            bypass_cache=True  # Old way
+        )
+        print(len(result.markdown))
+
+async def main():
+    await use_proxy()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### New Code (Recommended)
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode  # Import CacheMode
+
+async def use_proxy():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            cache_mode=CacheMode.BYPASS  # New way
+        )
+        print(len(result.markdown))
+
+async def main():
+    await use_proxy()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Common Migration Patterns
+
+Old Flag | New Mode
+---------|----------
+`bypass_cache=True` | `cache_mode=CacheMode.BYPASS`
+`disable_cache=True` | `cache_mode=CacheMode.DISABLED`
+`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY`
+`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY`
+
+## Suppressing Deprecation Warnings
+If you need time to migrate, you can temporarily suppress deprecation warnings:
+```python
+# In your config.py
+SHOW_DEPRECATION_WARNINGS = False
+```