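refactor: update cache handling in quickstart_async example to use CacheMode enum; add a BM25 content-filter run to speed_comparison and comment out the other example calls in main()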

2024-11-28 15:53:25 +08:00
parent 3abb573142
commit d583aa43ca
2 changed files with 296 additions and 269 deletions
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -13,7 +13,9 @@ import re
 from typing import Dict, List
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter
 from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
@@ -51,7 +53,7 @@ async def simple_example_with_running_js_code():
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            # wait_for=wait_for,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
        )
        print(result.markdown[:500])  # Print first 500 characters

@@ -61,7 +63,7 @@ async def simple_example_with_css_selector():
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            css_selector=".wide-tease-item__description",
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
        )
        print(result.markdown[:500])  # Print first 500 characters

@@ -132,7 +134,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
                extra_args=extra_args
            ),
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
        )
        print(result.extracted_content)

@@ -166,7 +168,7 @@ async def extract_structured_data_using_css_extractor():
        result = await crawler.arun(
            url="https://www.coinbase.com/explore",
            extraction_strategy=extraction_strategy,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
        )

        assert result.success, "Failed to crawl the page"
@@ -213,7 +215,7 @@ async def crawl_dynamic_content_pages_method_1():
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,
-                bypass_cache=True,
+                cache_mode=CacheMode.BYPASS,
                js_only=page > 0,
                headless=False,
            )
@@ -282,7 +284,7 @@ async def crawl_dynamic_content_pages_method_2():
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if page > 0 else None,
                js_only=page > 0,
-                bypass_cache=True,
+                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

@@ -343,7 +345,7 @@ async def crawl_dynamic_content_pages_method_3():
                js_code=js_next_page if page > 0 else None,
                wait_for=wait_for if page > 0 else None,
                js_only=page > 0,
-                bypass_cache=True,
+                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

@@ -384,7 +386,7 @@ async def crawl_with_user_simultion():
        url = "YOUR-URL-HERE"
        result = await crawler.arun(
            url=url,            
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
            magic = True, # Automatically detects and removes overlays, popups, and other elements that block content
            # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
            # override_navigator = True # Overrides the navigator object to make it look like a real user
@@ -408,7 +410,7 @@ async def speed_comparison():
    params={'formats': ['markdown', 'html']}
    )
    end = time.time()
-    print("Firecrawl (simulated):")
+    print("Firecrawl:")
    print(f"Time taken: {end - start:.2f} seconds")
    print(f"Content length: {len(scrape_status['markdown'])} characters")
    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
@@ -420,7 +422,7 @@ async def speed_comparison():
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            word_count_threshold=0,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        end = time.time()
@@ -430,6 +432,25 @@ async def speed_comparison():
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
        print()

+        # Crawl4AI with advanced content filtering
+        start = time.time()
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            word_count_threshold=0,
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
+            ),
+            cache_mode=CacheMode.BYPASS,
+            verbose=False,
+        )
+        end = time.time()
+        print("Crawl4AI (Markdown Plus):")
+        print(f"Time taken: {end - start:.2f} seconds")
+        print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
+        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
+        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
+        print()
+
        # Crawl4AI with JavaScript execution
        start = time.time()
        result = await crawler.arun(
@@ -438,13 +459,17 @@ async def speed_comparison():
                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
            ],
            word_count_threshold=0,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
+            ),
            verbose=False,
        )
        end = time.time()
        print("Crawl4AI (with JavaScript execution):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
+        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")

    print("\nNote on Speed Comparison:")
@@ -483,7 +508,7 @@ async def generate_knowledge_graph():
        url = "https://paulgraham.com/love.html"
        result = await crawler.arun(
            url=url,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
            extraction_strategy=extraction_strategy,
            # magic=True
        )
@@ -496,7 +521,7 @@ async def fit_markdown_remove_overlay():
        url = "https://janineintheworld.com/places-to-visit-in-central-mexico"
        result = await crawler.arun(
            url=url,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
            word_count_threshold = 10,
            remove_overlay_elements=True,
            screenshot = True
@@ -509,31 +534,31 @@ async def fit_markdown_remove_overlay():


 async def main():
-    await simple_crawl()
-    await simple_example_with_running_js_code()
-    await simple_example_with_css_selector()
-    await use_proxy()
-    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    await extract_structured_data_using_css_extractor()
+    # await simple_crawl()
+    # await simple_example_with_running_js_code()
+    # await simple_example_with_css_selector()
+    # await use_proxy()
+    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    # await extract_structured_data_using_css_extractor()

-    # LLM extraction examples
-    await extract_structured_data_using_llm()
-    await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-    await extract_structured_data_using_llm("ollama/llama3.2")    
+    # # LLM extraction examples
+    # await extract_structured_data_using_llm()
+    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
+    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    # await extract_structured_data_using_llm("ollama/llama3.2")    

-    # You always can pass custom headers to the extraction strategy
-    custom_headers = {
-        "Authorization": "Bearer your-custom-token",
-        "X-Custom-Header": "Some-Value"
-    }
-    await extract_structured_data_using_llm(extra_headers=custom_headers)
+    # # You always can pass custom headers to the extraction strategy
+    # custom_headers = {
+    #     "Authorization": "Bearer your-custom-token",
+    #     "X-Custom-Header": "Some-Value"
+    # }
+    # await extract_structured_data_using_llm(extra_headers=custom_headers)
    
-    # await crawl_dynamic_content_pages_method_1()
-    # await crawl_dynamic_content_pages_method_2()
-    await crawl_dynamic_content_pages_method_3()
+    # # await crawl_dynamic_content_pages_method_1()
+    # # await crawl_dynamic_content_pages_method_2()
+    # await crawl_dynamic_content_pages_method_3()
    
-    await crawl_custom_browser_type()
+    # await crawl_custom_browser_type()
    
    await speed_comparison()