From 2a82455b3dd3427f3099e201c2d88fadcc0c78fc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 17:17:34 +0800 Subject: [PATCH] feat(crawl): implement direct crawl functionality and introduce CacheMode for improved caching control --- docs/examples/docker_example.py | 33 ++++++++++++-- docs/md_v2/basic/cache-modes.md | 79 +++++++++++++++++++++++++++++++++ main.py | 46 ++++++++++++++++++- mkdocs.yml | 1 + 4 files changed, 153 insertions(+), 6 deletions(-) create mode 100644 docs/md_v2/basic/cache-modes.md diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 898f14da..17ef9f04 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -9,7 +9,7 @@ from typing import Dict, Any class Crawl4AiTester: def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None): self.base_url = base_url - self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: @@ -44,6 +44,16 @@ class Crawl4AiTester: raise TimeoutError("Task did not complete within server timeout") response.raise_for_status() return response.json() + + def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + """Directly crawl without using task queue""" + response = requests.post( + f"{self.base_url}/crawl_direct", + json=request_data, + headers=self.headers + ) + response.raise_for_status() + return response.json() def test_docker_deployment(version="basic"): tester = Crawl4AiTester( @@ -68,9 +78,10 @@ def test_docker_deployment(version="basic"): time.sleep(5) # Test cases based on version - test_basic_crawl(tester) - test_basic_crawl(tester) - test_basic_crawl_sync(tester) + # test_basic_crawl(tester) + # test_basic_crawl(tester) + # test_basic_crawl_sync(tester) + test_basic_crawl_direct(tester) # if version in ["full", "transformer"]: # test_cosine_extraction(tester) @@ -110,6 +121,20 @@ def test_basic_crawl_sync(tester: Crawl4AiTester): assert result['result']['success'] assert len(result['result']['markdown']) > 0 +def test_basic_crawl_direct(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Direct) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + # "session_id": "test" + "cache_mode": "bypass" # or "enabled", "disabled", "read_only", "write_only" + } + + result = tester.crawl_direct(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/basic/cache-modes.md new file mode 100644 index 00000000..04a4f218 --- /dev/null +++ b/docs/md_v2/basic/cache-modes.md @@ -0,0 +1,79 @@ +# Crawl4AI Cache System and Migration Guide + +## Overview +Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. + +## Old vs New Approach + +### Old Way (Deprecated) +The old system used multiple boolean flags: +- `bypass_cache`: Skip cache entirely +- `disable_cache`: Disable all caching +- `no_cache_read`: Don't read from cache +- `no_cache_write`: Don't write to cache + +### New Way (Recommended) +The new system uses a single `CacheMode` enum: +- `CacheMode.ENABLED`: Normal caching (read/write) +- `CacheMode.DISABLED`: No caching at all +- `CacheMode.READ_ONLY`: Only read from cache +- `CacheMode.WRITE_ONLY`: Only write to cache +- `CacheMode.BYPASS`: Skip cache for this operation + +## Migration Example + +### Old Code (Deprecated) +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + bypass_cache=True # Old way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### New Code (Recommended) +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode # Import CacheMode + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + cache_mode=CacheMode.BYPASS # New way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Common Migration Patterns + +Old Flag | New Mode +---------|---------- +`bypass_cache=True` | `cache_mode=CacheMode.BYPASS` +`disable_cache=True` | `cache_mode=CacheMode.DISABLED` +`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` +`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` + +## Suppressing Deprecation Warnings +If you need time to migrate, you can temporarily suppress deprecation warnings: +```python +# In your config.py +SHOW_DEPRECATION_WARNINGS = False +``` diff --git a/main.py b/main.py index 41788d61..ee5f7fc6 100644 --- a/main.py +++ b/main.py @@ -25,7 +25,7 @@ import logging from enum import Enum from dataclasses import dataclass import json -from crawl4ai import AsyncWebCrawler, CrawlResult +from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, @@ -66,6 +66,7 @@ class CrawlRequest(BaseModel): magic: bool = False extra: Optional[Dict[str, Any]] = {} session_id: Optional[str] = None + cache_mode: Optional[CacheMode] = None @dataclass class TaskInfo: @@ -329,7 +330,7 @@ app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages" # API token security security = HTTPBearer() -CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") +CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)): if not CRAWL4AI_API_TOKEN: @@ -419,6 +420,47 @@ async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: # If we get here, task didn't complete within timeout raise HTTPException(status_code=408, detail="Task timed out") +@app.post("/crawl_direct", dependencies=[Depends(verify_token)]) +async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]: + try: + crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params) + extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config) + + try: + if isinstance(request.urls, list): + results = await crawler.arun_many( + urls=[str(url) for url in request.urls], + extraction_strategy=extraction_strategy, + js_code=request.js_code, + wait_for=request.wait_for, + css_selector=request.css_selector, + screenshot=request.screenshot, + magic=request.magic, + cache_mode=request.cache_mode, + session_id=request.session_id, + **request.extra, + ) + return {"results": [result.dict() for result in results]} + else: + result = await crawler.arun( + url=str(request.urls), + extraction_strategy=extraction_strategy, + js_code=request.js_code, + wait_for=request.wait_for, + css_selector=request.css_selector, + screenshot=request.screenshot, + magic=request.magic, + cache_mode=request.cache_mode, + session_id=request.session_id, + **request.extra, + ) + return {"result": result.dict()} + finally: + await crawler_service.crawler_pool.release(crawler) + except Exception as e: + logger.error(f"Error in direct crawl: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + @app.get("/health") async def health_check(): available_slots = await crawler_service.resource_monitor.get_available_slots() diff --git a/mkdocs.yml b/mkdocs.yml index b09cb9eb..1b26b9df 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,6 +17,7 @@ nav: - 'Browser Configuration': 'basic/browser-config.md' - 'Page Interaction': 'basic/page-interaction.md' - 'Content Selection': 'basic/content-selection.md' + - 'Cache Modes': 'basic/cache-modes.md' - Advanced: - 'Content Processing': 'advanced/content-processing.md'