Enhance AsyncWebCrawler with smart waiting and screenshot capabilities

- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
- Improve error handling and timeout management in crawling process
- Fix typo in CrawlResult model (responser_headers -> response_headers)
- Update .gitignore to exclude additional files
- Adjust import path in test_basic_crawling.py
2024-10-02 17:34:56 +08:00
parent e0e0db4247
commit 4750810a67
10 changed files with 281 additions and 21 deletions
--- a/docs/examples/async_webcrawler_multiple_urls_example.py
+++ b/docs/examples/async_webcrawler_multiple_urls_example.py
@@ -0,0 +1,48 @@
+# File: async_webcrawler_multiple_urls_example.py
+import os, sys
+# append 2 parent directories to sys.path to import crawl4ai
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(parent_dir)
+
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    """Crawl several sites via ``arun_many`` and print a summary of each result."""
+    targets = [
+        "https://example.com",
+        "https://python.org",
+        "https://github.com",
+        "https://stackoverflow.com",
+        "https://news.ycombinator.com"
+    ]
+    min_words = 100  # forwarded to arun_many as word_count_threshold
+
+    # Keep the crawler open only while the batch runs.
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        outcomes = await crawler.arun_many(
+            urls=targets,
+            word_count_threshold=min_words,
+            bypass_cache=True,
+            verbose=True
+        )
+
+        for outcome in outcomes:
+            if not outcome.success:
+                # Report the failure and move on to the next result.
+                print(f"Failed to crawl: {outcome.url}")
+                print(f"Error: {outcome.error_message}")
+                print("---")
+                continue
+
+            print(f"Successfully crawled: {outcome.url}")
+            print(f"Title: {outcome.metadata.get('title', 'N/A')}")
+            print(f"Word count: {len(outcome.markdown.split())}")
+            internal_links = outcome.links.get('internal', [])
+            external_links = outcome.links.get('external', [])
+            print(f"Number of links: {len(internal_links) + len(external_links)}")
+            print(f"Number of images: {len(outcome.media.get('images', []))}")
+            print("---")
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/language_support_example.py
+++ b/docs/examples/language_support_example.py
@@ -0,0 +1,45 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy
+
+async def main():
+    """Show four ways to send Accept-Language; a failed crawl prints its error instead."""
+    def preview(res):
+        return (res.extracted_content or "")[:100] if res.success else f"Error: {res.error_message}"
+
+    # Example 1: Setting language when creating the crawler
+    strategy = AsyncPlaywrightCrawlerStrategy(
+        headers={"Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7"}
+    )
+    async with AsyncWebCrawler(crawler_strategy=strategy) as crawler1:  # context manager handles setup/teardown
+        result1 = await crawler1.arun("https://www.example.com")
+        print("Example 1 result:", preview(result1))  # Print first 100 characters
+
+    # Example 2: Setting language before crawling
+    async with AsyncWebCrawler() as crawler2:
+        crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
+        result2 = await crawler2.arun("https://www.example.com")
+        print("Example 2 result:", preview(result2))
+
+    # Example 3: Setting language when calling arun method
+    async with AsyncWebCrawler() as crawler3:
+        result3 = await crawler3.arun(
+            "https://www.example.com",
+            headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}
+        )
+        print("Example 3 result:", preview(result3))
+
+    # Example 4: Crawling multiple pages with different languages
+    urls = [
+        ("https://www.example.com", "fr-FR,fr;q=0.9"),
+        ("https://www.example.org", "es-ES,es;q=0.9"),
+        ("https://www.example.net", "de-DE,de;q=0.9"),
+    ]
+    async with AsyncWebCrawler() as crawler4:
+        results = await asyncio.gather(
+            *(crawler4.arun(url, headers={"Accept-Language": lang}) for url, lang in urls)
+        )
+    for (url, _), result in zip(urls, results):
+        print(f"Result for {url}:", preview(result))
+
+if __name__ == "__main__":
+    asyncio.run(main())