Apply Ruff Corrections

2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
--- a/docs/examples/v0.3.74.overview.py
+++ b/docs/examples/v0.3.74.overview.py
@@ -1,4 +1,5 @@
 import os, sys
+
 # append the parent directory to the sys.path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
@@ -13,19 +14,18 @@ import json
 from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.content_filter_strategy import BM25ContentFilter

+
 # 1. File Download Processing Example
 async def download_example():
    """Example of downloading files from Python.org"""
    # downloads_path = os.path.join(os.getcwd(), "downloads")
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)
-    
+
    print(f"Downloads will be saved to: {downloads_path}")
-    
+
    async with AsyncWebCrawler(
-        accept_downloads=True,
-        downloads_path=downloads_path,
-        verbose=True
+        accept_downloads=True, downloads_path=downloads_path, verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
@@ -40,9 +40,9 @@ async def download_example():
            }
            """,
            delay_before_return_html=1,  # Wait 1 second to ensure download starts
-            cache_mode=CacheMode.BYPASS
+            cache_mode=CacheMode.BYPASS,
        )
-        
+
        if result.downloaded_files:
            print("\nDownload successful!")
            print("Downloaded files:")
@@ -52,25 +52,26 @@ async def download_example():
        else:
            print("\nNo files were downloaded")

+
 # 2. Local File and Raw HTML Processing Example
 async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Create a sample HTML file
    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w") as f:
-        f.write("""
+        f.write(
+            """
        <html><body>
            <h1>Test Content</h1>
            <p>This is a test paragraph.</p>
        </body></html>
-        """)
-    
+        """
+        )
+
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Process local file
-        local_result = await crawler.arun(
-            url=f"file://{os.path.abspath(sample_file)}"
-        )
-        
+        local_result = await crawler.arun(url=f"file://{os.path.abspath(sample_file)}")
+
        # Process raw HTML
        raw_html = """
        <html><body>
@@ -78,16 +79,15 @@ async def local_and_raw_html_example():
            <p>This is a test of raw HTML processing.</p>
        </body></html>
        """
-        raw_result = await crawler.arun(
-            url=f"raw:{raw_html}"
-        )
-        
+        raw_result = await crawler.arun(url=f"raw:{raw_html}")
+
        # Clean up
        os.remove(sample_file)
-        
+
        print("Local file content:", local_result.markdown)
        print("\nRaw HTML content:", raw_result.markdown)

+
 # 3. Enhanced Markdown Generation Example
 async def markdown_generation_example():
    """Example of enhanced markdown generation with citations and LLM-friendly features"""
@@ -97,58 +97,66 @@ async def markdown_generation_example():
            # user_query="History and cultivation",
            bm25_threshold=1.0
        )
-        
+
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
            content_filter=content_filter,
-            cache_mode=CacheMode.BYPASS
+            cache_mode=CacheMode.BYPASS,
        )
-        
-        from crawl4ai import AsyncWebCrawler
+
        from crawl4ai.content_filter_strategy import BM25ContentFilter
-        
+
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
-            content_filter=BM25ContentFilter()
+            content_filter=BM25ContentFilter(),
        )
        print(result.markdown_v2.fit_markdown)
-        
+
        print("\nMarkdown Generation Results:")
        print(f"1. Original markdown length: {len(result.markdown)}")
-        print(f"2. New markdown versions (markdown_v2):")
+        print("2. New markdown versions (markdown_v2):")
        print(f"   - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
-        print(f"   - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}")
-        print(f"   - References section length: {len(result.markdown_v2.references_markdown)}")
+        print(
+            f"   - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}"
+        )
+        print(
+            f"   - References section length: {len(result.markdown_v2.references_markdown)}"
+        )
        if result.markdown_v2.fit_markdown:
-            print(f"   - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}")
-        
+            print(
+                f"   - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}"
+            )
+
        # Save examples to files
        output_dir = os.path.join(__data__, "markdown_examples")
        os.makedirs(output_dir, exist_ok=True)
-        
+
        # Save different versions
        with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
            f.write(result.markdown_v2.raw_markdown)
-            
+
        with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
            f.write(result.markdown_v2.markdown_with_citations)
-            
+
        with open(os.path.join(output_dir, "3_references.md"), "w") as f:
            f.write(result.markdown_v2.references_markdown)
-            
+
        if result.markdown_v2.fit_markdown:
            with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
                f.write(result.markdown_v2.fit_markdown)
-                
+
        print(f"\nMarkdown examples saved to: {output_dir}")
-        
+
        # Show a sample of citations and references
        print("\nSample of markdown with citations:")
        print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
        print("Sample of references:")
-        print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...")
+        print(
+            "\n".join(result.markdown_v2.references_markdown.split("\n")[:10]) + "..."
+        )
+

 # 4. Browser Management Example
 async def browser_management_example():
@@ -156,38 +164,38 @@ async def browser_management_example():
    # Use the specified user directory path
    user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
    os.makedirs(user_data_dir, exist_ok=True)
-    
+
    print(f"Browser profile will be saved to: {user_data_dir}")
-    
+
    async with AsyncWebCrawler(
        use_managed_browser=True,
        user_data_dir=user_data_dir,
        headless=False,
-        verbose=True
+        verbose=True,
    ) as crawler:
-
        result = await crawler.arun(
            url="https://crawl4ai.com",
            # session_id="persistent_session_1",
-            cache_mode=CacheMode.BYPASS
-        )        
+            cache_mode=CacheMode.BYPASS,
+        )
        # Use GitHub as an example - it's a good test for browser management
        # because it requires proper browser handling
        result = await crawler.arun(
            url="https://github.com/trending",
            # session_id="persistent_session_1",
-            cache_mode=CacheMode.BYPASS
+            cache_mode=CacheMode.BYPASS,
        )
-        
+
        print("\nBrowser session result:", result.success)
        if result.success:
-            print("Page title:", result.metadata.get('title', 'No title found'))
+            print("Page title:", result.metadata.get("title", "No title found"))
+

 # 5. API Usage Example
 async def api_example():
    """Example of using the new API endpoints"""
-    api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
-    headers = {'Authorization': f'Bearer {api_token}'}    
+    api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+    headers = {"Authorization": f"Bearer {api_token}"}
    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        crawl_request = {
@@ -199,25 +207,17 @@ async def api_example():
                        "name": "Hacker News Articles",
                        "baseSelector": ".athing",
                        "fields": [
-                            {
-                                "name": "title",
-                                "selector": ".title a",
-                                "type": "text"
-                            },
-                            {
-                                "name": "score",
-                                "selector": ".score",
-                                "type": "text"
-                            },
+                            {"name": "title", "selector": ".title a", "type": "text"},
+                            {"name": "score", "selector": ".score", "type": "text"},
                            {
                                "name": "url",
                                "selector": ".title a",
                                "type": "attribute",
-                                "attribute": "href"
-                            }
-                        ]
+                                "attribute": "href",
+                            },
+                        ],
                    }
-                }
+                },
            },
            "crawler_params": {
                "headless": True,
@@ -227,51 +227,50 @@ async def api_example():
            # "screenshot": True,
            # "magic": True
        }
-        
+
        async with session.post(
-            "http://localhost:11235/crawl",
-            json=crawl_request,
-            headers=headers
+            "http://localhost:11235/crawl", json=crawl_request, headers=headers
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]
-            
+
            # Check task status
            while True:
                async with session.get(
-                    f"http://localhost:11235/task/{task_id}",
-                    headers=headers
+                    f"http://localhost:11235/task/{task_id}", headers=headers
                ) as status_response:
                    result = await status_response.json()
                    print(f"Task status: {result['status']}")
-                    
+
                    if result["status"] == "completed":
                        print("Task completed!")
                        print("Results:")
-                        news = json.loads(result["results"][0]['extracted_content'])
+                        news = json.loads(result["results"][0]["extracted_content"])
                        print(json.dumps(news[:4], indent=2))
                        break
                    else:
                        await asyncio.sleep(1)

+
 # Main execution
 async def main():
    # print("Running Crawl4AI feature examples...")
-    
+
    # print("\n1. Running Download Example:")
    # await download_example()
-    
+
    # print("\n2. Running Markdown Generation Example:")
    # await markdown_generation_example()
-    
+
    # # print("\n3. Running Local and Raw HTML Example:")
    # await local_and_raw_html_example()
-    
+
    # # print("\n4. Running Browser Management Example:")
    await browser_management_example()
-    
+
    # print("\n5. Running API Example:")
    await api_example()

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())