refactor(docs): enhance quickstart_examples.py with improved configuration and file handling
@@ -4,12 +4,13 @@ import json
 import base64
 from pathlib import Path
 from typing import List
-from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
 from crawl4ai.proxy_strategy import ProxyConfig
 
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
 from crawl4ai import RoundRobinProxyStrategy
 from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
 from crawl4ai import LLMConfig
-from crawl4ai import PruningContentFilter
+from crawl4ai import PruningContentFilter, BM25ContentFilter
 from crawl4ai import DefaultMarkdownGenerator
 from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
+from crawl4ai import BrowserConfig
@@ -19,7 +20,12 @@ __cur_dir__ = Path(__file__).parent
 async def demo_basic_crawl():
     """Basic web crawling with markdown generation"""
     print("\n=== 1. Basic Web Crawling ===")
-    async with AsyncWebCrawler() as crawler:
+    async with AsyncWebCrawler(config = BrowserConfig(
+        viewport_height=800,
+        viewport_width=1200,
+        headless=True,
+        verbose=True,
+    )) as crawler:
         results: List[CrawlResult] = await crawler.arun(
             url="https://news.ycombinator.com/"
         )
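
Note: a minimal, self-contained sketch of how the new BrowserConfig-driven call runs end to end. The viewport, headless, and verbose values come from the hunk above; the result handling (checking success and markdown length) is illustrative, and the iteration follows the file's own typing of arun's return as List[CrawlResult] -- verify both against the installed crawl4ai version.

import asyncio
from typing import List

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlResult


async def basic_crawl_sketch():
    # Browser settings mirror the values introduced in this commit.
    browser_config = BrowserConfig(
        viewport_height=800,
        viewport_width=1200,
        headless=True,
        verbose=True,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results: List[CrawlResult] = await crawler.arun(url="https://news.ycombinator.com/")
        for result in results:
            # Illustrative result handling; not part of the commit.
            print(result.success, len(str(result.markdown or "")))


if __name__ == "__main__":
    asyncio.run(basic_crawl_sketch())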
@@ -281,15 +287,15 @@ async def demo_media_and_links():
             print(f"External link: {link['href']}")
 
     # # Save everything to files
-    # with open("images.json", "w") as f:
-    #     json.dump(images, f, indent=2)
+    with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
+        json.dump(images, f, indent=2)
 
-    # with open("links.json", "w") as f:
-    #     json.dump(
-    #         {"internal": internal_links, "external": external_links},
-    #         f,
-    #         indent=2,
-    #     )
+    with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
+        json.dump(
+            {"internal": internal_links, "external": external_links},
+            f,
+            indent=2,
+        )
 
 async def demo_screenshot_and_pdf():
     """Capture screenshot and PDF of a page"""
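
Note: the rewritten dumps target a tmp/ folder next to the script; the hunk does not show that directory being created, so a cautious run would ensure it exists first. A small sketch using only pathlib and json, assuming the same __cur_dir__ layout as the file:

import json
from pathlib import Path

__cur_dir__ = Path(__file__).parent

# Ensure the tmp/ output directory exists before writing (not shown in the diff).
out_dir = __cur_dir__ / "tmp"
out_dir.mkdir(parents=True, exist_ok=True)

# Same dump pattern as the new code, with Path objects instead of f-string paths.
images = []  # placeholder; the demo builds this list from the crawl result
with open(out_dir / "images.json", "w") as f:
    json.dump(images, f, indent=2)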
@@ -338,7 +344,7 @@ async def demo_proxy_rotation():
 
     async with AsyncWebCrawler() as crawler:
         config = CrawlerRunConfig(
-            proxy_rotation_strategy=proxy_strategy, cache_mode=CacheMode.BYPASS
+            proxy_rotation_strategy=proxy_strategy
         )
 
         # In a real scenario, these would be run and the proxies would rotate
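
Note: proxy_strategy is constructed earlier in demo_proxy_rotation() and is not visible in this hunk. A hedged sketch of the full wiring, assuming ProxyConfig accepts a server keyword and RoundRobinProxyStrategy takes a list of ProxyConfig objects (verify against the installed crawl4ai version); the proxy URLs are placeholders:

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RoundRobinProxyStrategy
from crawl4ai.proxy_strategy import ProxyConfig

# Placeholder proxies; constructor arguments are assumptions, not taken from the commit.
proxies = [
    ProxyConfig(server="http://proxy1.example.com:8080"),
    ProxyConfig(server="http://proxy2.example.com:8080"),
]
proxy_strategy = RoundRobinProxyStrategy(proxies)

config = CrawlerRunConfig(proxy_rotation_strategy=proxy_strategy)


async def rotate_sketch():
    async with AsyncWebCrawler() as crawler:
        # Each arun call should pick the next proxy in round-robin order.
        await crawler.arun(url="https://httpbin.org/ip", config=config)


if __name__ == "__main__":
    asyncio.run(rotate_sketch())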
@@ -386,17 +392,17 @@ async def main():
     print("Note: Some examples require API keys or other configurations")
 
     # Run all demos
-    # await demo_basic_crawl()
-    # await demo_parallel_crawl()
-    # await demo_fit_markdown()
-    # await demo_llm_structured_extraction_no_schema()
-    # await demo_css_structured_extraction_no_schema()
+    await demo_basic_crawl()
+    await demo_parallel_crawl()
+    await demo_fit_markdown()
+    await demo_llm_structured_extraction_no_schema()
+    await demo_css_structured_extraction_no_schema()
     await demo_deep_crawl()
-    # await demo_js_interaction()
-    # await demo_media_and_links()
-    # await demo_screenshot_and_pdf()
+    await demo_js_interaction()
+    await demo_media_and_links()
+    await demo_screenshot_and_pdf()
     # # await demo_proxy_rotation()
-    # await demo_raw_html_and_file()
+    await demo_raw_html_and_file()
 
     # Clean up any temp files that may have been created
     print("\n=== Demo Complete ===")
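
Note: with the previously commented demos re-enabled, main() now runs most examples in one pass. When iterating locally it can be handy to run a single demo instead; a minimal snippet for the bottom of the same module, using standard asyncio and the demo names from this file:

import asyncio


async def run_selected():
    # Pick individual demos while developing; names come from quickstart_examples.py.
    await demo_basic_crawl()
    await demo_media_and_links()


if __name__ == "__main__":
    # asyncio.run(main())        # full sequence, as enabled by this commit
    asyncio.run(run_selected())  # or only the demos under test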