refactor(docs): update import paths and clean up example code in quickstart_examples.py

2025-04-05 22:55:56 +08:00
parent 935d9d39f8
commit ca9351252a
1 changed file with 51 additions and 49 deletions
--- a/docs/examples/quickstart_examples.py
+++ b/docs/examples/quickstart_examples.py
@@ -5,7 +5,7 @@ import base64
 from pathlib import Path
 from typing import List
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
-from crawl4ai.configs import ProxyConfig
+from crawl4ai.proxy_strategy import ProxyConfig
 from crawl4ai import RoundRobinProxyStrategy
 from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
 from crawl4ai import LLMConfig
@@ -19,10 +19,9 @@ __cur_dir__ = Path(__file__).parent
 async def demo_basic_crawl():
    """Basic web crawling with markdown generation"""
    print("\n=== 1. Basic Web Crawling ===")
-
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
-            url="https://news.ycombinator.com/",
+            url="https://news.ycombinator.com/"
        )

        for i, result in enumerate(results):
@@ -34,7 +33,6 @@ async def demo_basic_crawl():
            else:
                print("Failed to crawl the URL")

-
 async def demo_parallel_crawl():
    """Crawl multiple URLs in parallel"""
    print("\n=== 2. Parallel Crawling ===")
@@ -56,14 +54,13 @@ async def demo_parallel_crawl():
                f"  {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
            )

-
 async def demo_fit_markdown():
    """Generate focused markdown with LLM content filter"""
    print("\n=== 3. Fit Markdown with LLM Content Filter ===")

    async with AsyncWebCrawler() as crawler:
-        result = await crawler.arun(
-            "https://en.wikipedia.org/wiki/Python_(programming_language)",
+        result: CrawlResult = await crawler.arun(
+            url = "https://en.wikipedia.org/wiki/Python_(programming_language)",
            config=CrawlerRunConfig(
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter()
@@ -75,7 +72,6 @@ async def demo_fit_markdown():
        print(f"Raw: {len(result.markdown.raw_markdown)} chars")
        print(f"Fit: {len(result.markdown.fit_markdown)} chars")

-
 async def demo_llm_structured_extraction_no_schema():
    # Create a simple LLM extraction strategy (no schema required)
    extraction_strategy = LLMExtractionStrategy(
@@ -83,7 +79,7 @@ async def demo_llm_structured_extraction_no_schema():
            provider="groq/qwen-2.5-32b",
            api_token="env:GROQ_API_KEY",
        ),
-        instruction="This is news.ycombinator.com, extract all news for each. title, source url, number of comments.",
+        instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
        extract_type="schema",
        schema="{title: string, url: string, comments: int}",
        extra_args={
@@ -109,7 +105,6 @@ async def demo_llm_structured_extraction_no_schema():
            else:
                print("Failed to extract structured data")

-
 async def demo_css_structured_extraction_no_schema():
    """Extract structured data using CSS selectors"""
    print("\n=== 5. CSS-Based Structured Extraction ===")
@@ -129,27 +124,33 @@ async def demo_css_structured_extraction_no_schema():
                    <span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
                    <span class="h-tags">Malware / Supply Chain Attack</span>
                </div>
-                <div class="home-desc"> Cybersecurity researchers have uncovered malicious libraries in the Python Package Index (PyPI) repository that are designed to steal sensitive information.  Two of the packages, bitcoinlibdbfix and bitcoinlib-dev, masquerade as fixes for recent issues  detected in a legitimate Python module called bitcoinlib, according to ReversingLabs . A third package discovered  by Socket, disgrasya, contained a fully automated carding script targeting WooCommerce stores.  The packages attracted hundreds of downloads before being taken down, according to statistics from pepy.tech -   bitcoinlibdbfix  - 1,101 downloads  bitcoinlib-dev  - 735 downloads  disgrasya  - 37,217 downloads   "The malicious libraries both attempt a similar attack, overwriting the legitimate 'clw cli' command with malicious code that attempts to exfiltrate sensitive database files," ReversingLabs said.   In an interesting twist, the authors of the counterfeit libraries are said to have joined a GitHub issue...</div>
+                <div class="home-desc"> Cybersecurity researchers have...</div>
            </div>
        </div>
    </a>
 </div>
    """

-    # Generate schema using LLM (one-time setup)
-    schema = JsonCssExtractionStrategy.generate_schema(
-        html=sample_html,
-        llm_config=LLMConfig(
-            provider="groq/qwen-2.5-32b",
-            api_token="env:GROQ_API_KEY",
-        ),
-        query="From https://thehackernews.com/, I have shares a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
-    )
+    # Check if schema file exists
+    schema_file_path = f"{__cur_dir__}/tmp/schema.json"
+    if os.path.exists(schema_file_path):
+        with open(schema_file_path, "r") as f:
+            schema = json.load(f)
+    else:
+        # Generate schema using LLM (one-time setup)
+        schema = JsonCssExtractionStrategy.generate_schema(
+            html=sample_html,
+            llm_config=LLMConfig(
+                provider="groq/qwen-2.5-32b",
+                api_token="env:GROQ_API_KEY",
+            ),
+            query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
+        )

    print(f"Generated schema: {json.dumps(schema, indent=2)}")
    # Save the schema to a file , and use it for future extractions, in result for such extraction you will call LLM once
-    # with open("schema.json", "w") as f:
-    #     json.dump(schema, f, indent=2)
+    with open(f"{__cur_dir__}/tmp/schema.json", "w") as f:
+        json.dump(schema, f, indent=2)

    # Create no-LLM extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema)
@@ -170,7 +171,6 @@ async def demo_css_structured_extraction_no_schema():
            else:
                print("Failed to extract structured data")

-
 async def demo_deep_crawl():
    """Deep crawling with BFS strategy"""
    print("\n=== 6. Deep Crawling ===")
@@ -192,7 +192,6 @@ async def demo_deep_crawl():
            depth = result.metadata.get("depth", "unknown")
            print(f"  {i + 1}. {result.url} (Depth: {depth})")

-
 async def demo_js_interaction():
    """Execute JavaScript to load more content"""
    print("\n=== 7. JavaScript Interaction ===")
@@ -255,8 +254,6 @@ async def demo_js_interaction():
                print("Failed to extract structured data")
        print(f"Total items: {len(news)}")

-
-
 async def demo_media_and_links():
    """Extract media and links from a page"""
    print("\n=== 8. Media and Links Extraction ===")
@@ -275,17 +272,24 @@ async def demo_media_and_links():
            print(f"Found {len(internal_links)} internal links")
            print(f"Found {len(external_links)} external links")

-            # Save everything to files
-            with open("images.json", "w") as f:
-                json.dump(images, f, indent=2)
+            # Print some of the images and links
+            for image in images[:3]:
+                print(f"Image: {image['src']}")
+            for link in internal_links[:3]:
+                print(f"Internal link: {link['href']}")
+            for link in external_links[:3]:
+                print(f"External link: {link['href']}")

-            with open("links.json", "w") as f:
-                json.dump(
-                    {"internal": internal_links, "external": external_links},
-                    f,
-                    indent=2,
-                )
+            # # Save everything to files
+            # with open("images.json", "w") as f:
+            #     json.dump(images, f, indent=2)

+            # with open("links.json", "w") as f:
+            #     json.dump(
+            #         {"internal": internal_links, "external": external_links},
+            #         f,
+            #         indent=2,
+            #     )

 async def demo_screenshot_and_pdf():
    """Capture screenshot and PDF of a page"""
@@ -299,6 +303,7 @@ async def demo_screenshot_and_pdf():
        )

        for i, result in enumerate(result):
+            # if result.screenshot_data:
            if result.screenshot:
                # Save screenshot
                screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
@@ -306,6 +311,7 @@ async def demo_screenshot_and_pdf():
                    f.write(base64.b64decode(result.screenshot))
                print(f"Screenshot saved to {screenshot_path}")

+            # if result.pdf_data:
            if result.pdf:
                # Save PDF
                pdf_path = f"{__cur_dir__}/tmp/example.pdf"
@@ -313,7 +319,6 @@ async def demo_screenshot_and_pdf():
                    f.write(result.pdf)
                print(f"PDF saved to {pdf_path}")

-
 async def demo_proxy_rotation():
    """Proxy rotation for multiple requests"""
    print("\n=== 10. Proxy Rotation ===")
@@ -339,7 +344,6 @@ async def demo_proxy_rotation():
        # In a real scenario, these would be run and the proxies would rotate
        print("In a real scenario, requests would rotate through the available proxies")

-
 async def demo_raw_html_and_file():
    """Process raw HTML and local files"""
    print("\n=== 11. Raw HTML and Local Files ===")
@@ -376,29 +380,27 @@ async def demo_raw_html_and_file():
    os.remove(file_path)
    print(f"Processed both raw HTML and local file ({file_path})")

-
 async def main():
    """Run all demo functions sequentially"""
    print("=== Comprehensive Crawl4AI Demo ===")
    print("Note: Some examples require API keys or other configurations")

    # Run all demos
-    await demo_basic_crawl()
-    await demo_parallel_crawl()
-    await demo_fit_markdown()
-    await demo_llm_structured_extraction_no_schema()
-    await demo_css_structured_extraction_no_schema()
+    # await demo_basic_crawl()
+    # await demo_parallel_crawl()
+    # await demo_fit_markdown()
+    # await demo_llm_structured_extraction_no_schema()
+    # await demo_css_structured_extraction_no_schema()
    await demo_deep_crawl()
-    await demo_js_interaction()
-    await demo_media_and_links()
-    await demo_screenshot_and_pdf()
-    # await demo_proxy_rotation()
-    await demo_raw_html_and_file()
+    # await demo_js_interaction()
+    # await demo_media_and_links()
+    # await demo_screenshot_and_pdf()
+    # # await demo_proxy_rotation()
+    # await demo_raw_html_and_file()

    # Clean up any temp files that may have been created
    print("\n=== Demo Complete ===")
    print("Check for any generated files (screenshots, PDFs) in the current directory")

-
 if __name__ == "__main__":
    asyncio.run(main())