Apply Ruff Corrections

2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -1,6 +1,10 @@
 import os, sys
+
 # append parent directory to system path
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))); os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692";
+sys.path.append(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692"

 import asyncio
 # import nest_asyncio
@@ -15,7 +19,7 @@ from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
@@ -32,9 +36,12 @@ print("Website: https://crawl4ai.com")
 async def simple_crawl():
    print("\n--- Basic Usage ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
-        result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS)
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
+        )
        print(result.markdown[:500])  # Print first 500 characters

+
 async def simple_example_with_running_js_code():
    print("\n--- Executing JavaScript and Using CSS Selectors ---")
    # New code to handle the wait_for parameter
@@ -57,6 +64,7 @@ async def simple_example_with_running_js_code():
        )
        print(result.markdown[:500])  # Print first 500 characters

+
 async def simple_example_with_css_selector():
    print("\n--- Using CSS Selectors ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -67,42 +75,44 @@ async def simple_example_with_css_selector():
        )
        print(result.markdown[:500])  # Print first 500 characters

+
 async def use_proxy():
    print("\n--- Using a Proxy ---")
    print(
        "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
    )
    # Uncomment and modify the following lines to use a proxy
-    async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
+    async with AsyncWebCrawler(
+        verbose=True, proxy="http://your-proxy-url:port"
+    ) as crawler:
        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            cache_mode= CacheMode.BYPASS
+            url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
        )
        if result.success:
            print(result.markdown[:500])  # Print first 500 characters

+
 async def capture_and_save_screenshot(url: str, output_path: str):
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
-            url=url,
-            screenshot=True,
-            cache_mode= CacheMode.BYPASS
+            url=url, screenshot=True, cache_mode=CacheMode.BYPASS
        )
-        
+
        if result.success and result.screenshot:
            import base64
-            
+
            # Decode the base64 screenshot data
            screenshot_data = base64.b64decode(result.screenshot)
-            
+
            # Save the screenshot as a JPEG file
-            with open(output_path, 'wb') as f:
+            with open(output_path, "wb") as f:
                f.write(screenshot_data)
-            
+
            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")

+
 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
@@ -110,16 +120,19 @@ class OpenAIModelFee(BaseModel):
        ..., description="Fee for output token for the OpenAI model."
    )

-async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
+
+async def extract_structured_data_using_llm(
+    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
    print(f"\n--- Extracting Structured Data with {provider} ---")
-    
+
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    # extra_args = {}
-    extra_args={
-        "temperature": 0, 
+    extra_args = {
+        "temperature": 0,
        "top_p": 0.9,
        "max_tokens": 2000,
        # any other supported parameters for litellm
@@ -139,52 +152,49 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
                Do not miss any models in the entire content. One extracted model JSON format should look like this: 
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
-                extra_args=extra_args
+                extra_args=extra_args,
            ),
            cache_mode=CacheMode.BYPASS,
        )
        print(result.extracted_content)

+
 async def extract_structured_data_using_css_extractor():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
-    "name": "KidoCode Courses",
-    "baseSelector": "section.charge-methodology .w-tab-content > div",
-    "fields": [
-        {
-            "name": "section_title",
-            "selector": "h3.heading-50",
-            "type": "text",
-        },
-        {
-            "name": "section_description",
-            "selector": ".charge-content",
-            "type": "text",
-        },
-        {
-            "name": "course_name",
-            "selector": ".text-block-93",
-            "type": "text",
-        },
-        {
-            "name": "course_description",
-            "selector": ".course-content-text",
-            "type": "text",
-        },
-        {
-            "name": "course_icon",
-            "selector": ".image-92",
-            "type": "attribute",
-            "attribute": "src"
-        }
-    ]
-}
+        "name": "KidoCode Courses",
+        "baseSelector": "section.charge-methodology .w-tab-content > div",
+        "fields": [
+            {
+                "name": "section_title",
+                "selector": "h3.heading-50",
+                "type": "text",
+            },
+            {
+                "name": "section_description",
+                "selector": ".charge-content",
+                "type": "text",
+            },
+            {
+                "name": "course_name",
+                "selector": ".text-block-93",
+                "type": "text",
+            },
+            {
+                "name": "course_description",
+                "selector": ".course-content-text",
+                "type": "text",
+            },
+            {
+                "name": "course_icon",
+                "selector": ".image-92",
+                "type": "attribute",
+                "attribute": "src",
+            },
+        ],
+    }

-    async with AsyncWebCrawler(
-        headless=True,
-        verbose=True
-    ) as crawler:
-        
+    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        # Create the JavaScript that handles clicking multiple times
        js_click_tabs = """
        (async () => {
@@ -198,19 +208,20 @@ async def extract_structured_data_using_css_extractor():
                await new Promise(r => setTimeout(r, 500));
            }
        })();
-        """     
+        """

        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology",
            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
            js_code=[js_click_tabs],
-            cache_mode=CacheMode.BYPASS
+            cache_mode=CacheMode.BYPASS,
        )

        companies = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(companies)} companies")
        print(json.dumps(companies[0], indent=2))

+
 # Advanced Session-Based Crawling with Dynamic Content 🔄
 async def crawl_dynamic_content_pages_method_1():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
@@ -267,6 +278,7 @@ async def crawl_dynamic_content_pages_method_1():
        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

+
 async def crawl_dynamic_content_pages_method_2():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

@@ -334,8 +346,11 @@ async def crawl_dynamic_content_pages_method_2():
        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

+
 async def crawl_dynamic_content_pages_method_3():
-    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---")
+    print(
+        "\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---"
+    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
@@ -357,7 +372,7 @@ async def crawl_dynamic_content_pages_method_3():
            const firstCommit = commits[0].textContent.trim();
            return firstCommit !== window.firstCommit;
        }"""
-        
+
        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
@@ -395,40 +410,53 @@ async def crawl_dynamic_content_pages_method_3():
        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

+
 async def crawl_custom_browser_type():
    # Use Firefox
    start = time.time()
-    async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+    async with AsyncWebCrawler(
+        browser_type="firefox", verbose=True, headless=True
+    ) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com", cache_mode=CacheMode.BYPASS
+        )
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)

    # Use WebKit
    start = time.time()
-    async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+    async with AsyncWebCrawler(
+        browser_type="webkit", verbose=True, headless=True
+    ) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com", cache_mode=CacheMode.BYPASS
+        )
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)

    # Use Chromium (default)
    start = time.time()
-    async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com", cache_mode=CacheMode.BYPASS
+        )
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)

+
 async def crawl_with_user_simultion():
    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
        url = "YOUR-URL-HERE"
        result = await crawler.arun(
-            url=url,            
+            url=url,
            cache_mode=CacheMode.BYPASS,
-            magic = True, # Automatically detects and removes overlays, popups, and other elements that block content
+            magic=True,  # Automatically detects and removes overlays, popups, and other elements that block content
            # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
            # override_navigator = True # Overrides the navigator object to make it look like a real user
        )
-        
-        print(result.markdown)    
+
+        print(result.markdown)
+

 async def speed_comparison():
    # print("\n--- Speed Comparison ---")
@@ -439,18 +467,18 @@ async def speed_comparison():
    # print()
    # Simulated Firecrawl performance
    from firecrawl import FirecrawlApp
-    app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+
+    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    start = time.time()
    scrape_status = app.scrape_url(
-    'https://www.nbcnews.com/business',
-    params={'formats': ['markdown', 'html']}
+        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
    )
    end = time.time()
    print("Firecrawl:")
    print(f"Time taken: {end - start:.2f} seconds")
    print(f"Content length: {len(scrape_status['markdown'])} characters")
    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
-    print()    
+    print()

    async with AsyncWebCrawler() as crawler:
        # Crawl4AI simple crawl
@@ -474,7 +502,9 @@ async def speed_comparison():
            url="https://www.nbcnews.com/business",
            word_count_threshold=0,
            markdown_generator=DefaultMarkdownGenerator(
-                content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+                content_filter=PruningContentFilter(
+                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
+                )
                # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
            ),
            cache_mode=CacheMode.BYPASS,
@@ -498,7 +528,9 @@ async def speed_comparison():
            word_count_threshold=0,
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
-                content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+                content_filter=PruningContentFilter(
+                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
+                )
                # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
            ),
            verbose=False,
@@ -520,11 +552,12 @@ async def speed_comparison():
    print("If you run these tests in an environment with better network conditions,")
    print("you may observe an even more significant speed advantage for Crawl4AI.")

+
 async def generate_knowledge_graph():
    class Entity(BaseModel):
        name: str
        description: str
-        
+
    class Relationship(BaseModel):
        entity1: Entity
        entity2: Entity
@@ -536,11 +569,11 @@ async def generate_knowledge_graph():
        relationships: List[Relationship]

    extraction_strategy = LLMExtractionStrategy(
-            provider='openai/gpt-4o-mini', # Or any other provider, including Ollama and open source models
-            api_token=os.getenv('OPENAI_API_KEY'), # In case of Ollama just pass "no-token"
-            schema=KnowledgeGraph.model_json_schema(),
-            extraction_type="schema",
-            instruction="""Extract entities and relationships from the given text."""
+        provider="openai/gpt-4o-mini",  # Or any other provider, including Ollama and open source models
+        api_token=os.getenv("OPENAI_API_KEY"),  # In case of Ollama just pass "no-token"
+        schema=KnowledgeGraph.model_json_schema(),
+        extraction_type="schema",
+        instruction="""Extract entities and relationships from the given text.""",
    )
    async with AsyncWebCrawler() as crawler:
        url = "https://paulgraham.com/love.html"
@@ -554,27 +587,22 @@ async def generate_knowledge_graph():
        with open(os.path.join(__location__, "kb.json"), "w") as f:
            f.write(result.extracted_content)

+
 async def fit_markdown_remove_overlay():
-    
    async with AsyncWebCrawler(
-            headless=True,  # Set to False to see what is happening
-            verbose=True,
-            user_agent_mode="random",
-            user_agent_generator_config={
-                "device_type": "mobile",
-                "os_type": "android"
-            },
+        headless=True,  # Set to False to see what is happening
+        verbose=True,
+        user_agent_mode="random",
+        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
    ) as crawler:
        result = await crawler.arun(
-            url='https://www.kidocode.com/degrees/technology',
+            url="https://www.kidocode.com/degrees/technology",
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
                ),
-                options={
-                    "ignore_links": True
-                }
+                options={"ignore_links": True},
            ),
            # markdown_generator=DefaultMarkdownGenerator(
            #     content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
@@ -583,31 +611,38 @@ async def fit_markdown_remove_overlay():
            #     }
            # ),
        )
-        
+
        if result.success:
            print(len(result.markdown_v2.raw_markdown))
            print(len(result.markdown_v2.markdown_with_citations))
            print(len(result.markdown_v2.fit_markdown))
-            
+
            # Save clean html
            with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
                f.write(result.cleaned_html)
-            
-            with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f:
+
+            with open(
+                os.path.join(__location__, "output/output_raw_markdown.md"), "w"
+            ) as f:
                f.write(result.markdown_v2.raw_markdown)
-                
-            with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f:
-                f.write(result.markdown_v2.markdown_with_citations) 
-                
-            with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f:   
+
+            with open(
+                os.path.join(__location__, "output/output_markdown_with_citations.md"),
+                "w",
+            ) as f:
+                f.write(result.markdown_v2.markdown_with_citations)
+
+            with open(
+                os.path.join(__location__, "output/output_fit_markdown.md"), "w"
+            ) as f:
                f.write(result.markdown_v2.fit_markdown)
-        
+
    print("Done")


 async def main():
    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-    
+
    # await simple_crawl()
    # await simple_example_with_running_js_code()
    # await simple_example_with_css_selector()
@@ -618,7 +653,7 @@ async def main():
    # LLM extraction examples
    # await extract_structured_data_using_llm()
    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
-    # await extract_structured_data_using_llm("ollama/llama3.2")    
+    # await extract_structured_data_using_llm("ollama/llama3.2")

    # You always can pass custom headers to the extraction strategy
    # custom_headers = {
@@ -626,13 +661,13 @@ async def main():
    #     "X-Custom-Header": "Some-Value"
    # }
    # await extract_structured_data_using_llm(extra_headers=custom_headers)
-    
+
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
    await crawl_dynamic_content_pages_method_3()
-    
+
    # await crawl_custom_browser_type()
-    
+
    # await speed_comparison()