feat: Enhance crawler flexibility and LLM extraction capabilities

- Add browser type selection (Chromium, Firefox, WebKit)
- Implement iframe content extraction
- Improve image processing and dimension updates
- Add custom headers support in AsyncPlaywrightCrawlerStrategy
- Enhance delayed content retrieval with new parameter
- Optimize HTML sanitization and Markdown conversion
- Update examples in quickstart_async.py for new features
2024-10-14 21:03:28 +08:00
parent b9bbd42373
commit 320afdea64
7 changed files with 238 additions and 93 deletions
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -357,6 +357,28 @@ async def crawl_dynamic_content_pages_method_3():
        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

+async def crawl_custom_browser_type():
+    # Firefox: chosen via browser_type; the timer starts before the crawler context is entered, so that setup is part of "Time taken"
+    start = time.time()
+    async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
+        result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+        print(result.markdown[:500])
+        print("Time taken: ", time.time() - start)
+
+    # WebKit: same timed crawl of example.com with bypass_cache=True; only browser_type differs
+    start = time.time()
+    async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
+        result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+        print(result.markdown[:500])
+        print("Time taken: ", time.time() - start)
+
+    # Chromium (default): browser_type is not passed, so the crawler falls back to its default engine
+    start = time.time()
+    async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
+        result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+        print(result.markdown[:500])
+        print("Time taken: ", time.time() - start)
+
 async def speed_comparison():
    # print("\n--- Speed Comparison ---")
    # print("Firecrawl (simulated):")
@@ -446,6 +468,9 @@ async def main():
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
    await crawl_dynamic_content_pages_method_3()
+    
+    await crawl_custom_browser_type()
+    
    await speed_comparison()