feat(docs): update examples and documentation to replace bypass_cache with cache_mode for improved clarity

2024-11-17 19:44:45 +08:00
parent a59c107b23
commit df63a40606
17 changed files with 422 additions and 80 deletions
--- a/docs/md_v2/api/arun.md
+++ b/docs/md_v2/api/arun.md
@@ -8,11 +8,26 @@ The following parameters can be passed to the `arun()` method. They are organize
 await crawler.arun(
    url="https://example.com",   # Required: URL to crawl
    verbose=True,               # Enable detailed logging
-    bypass_cache=False,         # Skip cache for this request
+    cache_mode=CacheMode.ENABLED,  # Control cache behavior
    warmup=True                # Whether to run warmup check
 )
 ```

+## Cache Control
+
+```python
+from crawl4ai import CacheMode
+
+await crawler.arun(
+    cache_mode=CacheMode.ENABLED,    # Normal caching (read/write)
+    # Other cache modes:
+    # cache_mode=CacheMode.DISABLED   # No caching at all
+    # cache_mode=CacheMode.READ_ONLY  # Only read from cache
+    # cache_mode=CacheMode.WRITE_ONLY # Only write to cache
+    # cache_mode=CacheMode.BYPASS     # Skip cache for this operation
+)
+```
+
 ## Content Processing Parameters

 ### Text Processing
@@ -162,14 +177,13 @@ await crawler.arun(

 ## Parameter Interactions and Notes

-1. **Magic Mode Combinations**
+1. **Cache and Performance Setup**
   ```python
-   # Full anti-detection setup
+   # Optimal caching for repeated crawls
   await crawler.arun(
-       magic=True,
-       headless=False,
-       simulate_user=True,
-       override_navigator=True
+       cache_mode=CacheMode.ENABLED,
+       word_count_threshold=10,
+       process_iframes=False
   )
   ```

@@ -179,7 +193,8 @@ await crawler.arun(
   await crawler.arun(
       js_code="window.scrollTo(0, document.body.scrollHeight);",
       wait_for="css:.lazy-content",
-       delay_before_return_html=2.0
+       delay_before_return_html=2.0,
+       cache_mode=CacheMode.WRITE_ONLY  # Cache results after dynamic load
   )
   ```

@@ -192,7 +207,8 @@ await crawler.arun(
       extraction_strategy=my_strategy,
       chunking_strategy=my_chunking,
       process_iframes=True,
-       remove_overlay_elements=True
+       remove_overlay_elements=True,
+       cache_mode=CacheMode.ENABLED
   )
   ```

@@ -201,7 +217,7 @@ await crawler.arun(
 1. **Performance Optimization**
   ```python
   await crawler.arun(
-       bypass_cache=False,           # Use cache when possible
+       cache_mode=CacheMode.ENABLED,  # Use full caching
       word_count_threshold=10,      # Filter out noise
       process_iframes=False         # Skip iframes if not needed
   )
@@ -212,7 +228,8 @@ await crawler.arun(
   await crawler.arun(
       magic=True,                   # Enable anti-detection
       delay_before_return_html=1.0, # Wait for dynamic content
-       page_timeout=60000           # Longer timeout for slow pages
+       page_timeout=60000,          # Longer timeout for slow pages
+       cache_mode=CacheMode.WRITE_ONLY  # Cache results after successful crawl
   )
   ```

@@ -221,6 +238,7 @@ await crawler.arun(
   await crawler.arun(
       remove_overlay_elements=True,  # Remove popups
       excluded_tags=['nav', 'aside'],# Remove unnecessary elements
-       keep_data_attributes=False     # Remove data attributes
+       keep_data_attributes=False,    # Remove data attributes
+       cache_mode=CacheMode.ENABLED   # Use cache for faster processing
   )
   ```
--- a/docs/md_v2/api/crawl-result.md
+++ b/docs/md_v2/api/crawl-result.md
@@ -20,6 +20,7 @@ class CrawlResult(BaseModel):
    fit_html: Optional[str] = None          # Most relevant HTML content
    markdown: Optional[str] = None          # HTML converted to markdown
    fit_markdown: Optional[str] = None      # Most relevant markdown content
+    downloaded_files: Optional[List[str]] = None  # Paths of files downloaded during the crawl
    
    # Extracted Data
    extracted_content: Optional[str] = None  # Content from extraction strategy
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -32,4 +32,5 @@
 | async_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request |
 | async_webcrawler.py | session_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse |
 | async_webcrawler.py | only_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content |
-| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl |
+| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl |
+| async_webcrawler.py | cache_mode | `kwargs.get("cache_mode", CacheMode.ENABLED)` | AsyncWebCrawler | Cache handling mode for request |