Implement new async crawler features and stability updates
- Introduced new async crawl strategy with session management. - Added BrowserManager for improved browser management. - Enhanced documentation, focusing on storage state and usage examples. - Improved error handling and logging for sessions. - Added JavaScript snippets for customizing navigator properties.
This commit is contained in:
153
tests/async/test_0.4.2_browser_manager.py
Normal file
153
tests/async/test_0.4.2_browser_manager.py
Normal file
@@ -0,0 +1,153 @@
|
||||
import os
import sys
import asyncio

# Make the repository root importable when this file is executed directly
# (tests/async/<this file> -> two levels up). Must run BEFORE the crawl4ai
# imports below, so keep these statements ahead of them.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

# Absolute, symlink-resolved directory of this test file (kept for parity
# with the other test modules in this suite).
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
# Assuming that the changes made allow different configurations
|
||||
# for managed browser, persistent context, and so forth.
|
||||
|
||||
async def test_default_headless():
    """Smoke-test a plain headless crawl with an ephemeral browser context.

    Uses a randomized mobile/Android user agent and no managed browser or
    persistent profile, then prints the crawl outcome and HTML size.
    """
    browser_opts = dict(
        headless=True,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
        use_managed_browser=False,
        use_persistent_context=False,
        ignore_https_errors=True,
        # Testing normal ephemeral context
    )
    async with AsyncWebCrawler(**browser_opts) as crawler:
        md_generator = DefaultMarkdownGenerator(options={"ignore_links": True})
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/technology',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=md_generator,
        )
        print("[test_default_headless] success:", result.success)
        print("HTML length:", len(result.html or ""))
|
||||
|
||||
async def test_managed_browser_persistent():
    """Exercise the managed-browser path with a persistent user profile.

    ``use_persistent_context=True`` is treated the same as the managed-browser
    scenario, so profile data under ``user_data_dir`` should be stored and
    reused across runs. Prints the crawl outcome and HTML size.
    """
    # Treating use_persistent_context=True as managed_browser scenario.
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "mac"},
        use_managed_browser=True,
        use_persistent_context=True,  # now should behave same as managed browser
        # Fixed typo: "./outpu/..." -> "./output/..." so the profile lands in
        # the intended output directory and is actually reused across runs.
        user_data_dir="./output/test_profile",
        # This should store and reuse profile data across runs
    ) as crawler:
        result = await crawler.arun(
            url='https://www.google.com',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_managed_browser_persistent] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))
|
||||
|
||||
async def test_session_reuse():
    """Create a browser session and reuse it across two sequential crawls.

    Both calls pass the same ``session_id`` so state (e.g. cookies) set by the
    first request can carry over to the second. A fixed user agent keeps the
    two requests consistent.
    """
    session_id = "my_session"
    async with AsyncWebCrawler(
        headless=False,
        verbose=True,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        # Fixed user-agent for consistency
        use_managed_browser=False,
        use_persistent_context=False,
    ) as crawler:
        md_generator = DefaultMarkdownGenerator(options={"ignore_links": True})

        # First call creates the session; second reuses it (cookies may be retained).
        for label, target in (
            ("first call", 'https://www.example.com'),
            ("second call", 'https://www.example.com/about'),
        ):
            result = await crawler.arun(
                url=target,
                cache_mode=CacheMode.BYPASS,
                session_id=session_id,
                markdown_generator=md_generator,
            )
            print(f"[test_session_reuse {label}] success:", result.success)
|
||||
|
||||
async def test_magic_mode():
    """Crawl with magic mode plus navigator override and user simulation.

    Uses a randomized desktop/Windows user agent on an ephemeral context and
    prints the crawl outcome and HTML size.
    """
    browser_opts = dict(
        headless=False,
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
        use_managed_browser=False,
        use_persistent_context=False,
        magic=True,
        override_navigator=True,
        simulate_user=True,
    )
    async with AsyncWebCrawler(**browser_opts) as crawler:
        result = await crawler.arun(
            url='https://www.kidocode.com/degrees/business',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_magic_mode] success:", result.success)
        print("HTML length:", len(result.html or ""))
|
||||
|
||||
async def test_proxy_settings():
    """Verify crawling works when routed through an HTTP proxy.

    Assumes a local proxy server is listening on 127.0.0.1:8080; the target
    (httpbin.org/ip) echoes the apparent client IP so the proxy's effect is
    visible in the HTML preview.
    """
    async with AsyncWebCrawler(
        headless=True,
        verbose=False,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
        proxy="http://127.0.0.1:8080",  # Assuming local proxy server for test
        use_managed_browser=False,
        use_persistent_context=False,
    ) as crawler:
        result = await crawler.arun(
            url='https://httpbin.org/ip',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_proxy_settings] success:", result.success)
        if result.success:
            preview = result.html[:200] if result.html else ""
            print("HTML preview:", preview)
|
||||
|
||||
async def test_ignore_https_errors():
    """Check that ``ignore_https_errors=True`` lets a bad-cert site load.

    Targets self-signed.badssl.com, which deliberately serves a self-signed
    certificate, and prints whether the crawl succeeded.
    """
    # Conceptual check: the chosen domain must trigger an SSL error so the
    # ignore_https_errors flag is actually exercised.
    browser_opts = dict(
        headless=True,
        verbose=True,
        user_agent="Mozilla/5.0",
        ignore_https_errors=True,
        use_managed_browser=False,
        use_persistent_context=False,
    )
    async with AsyncWebCrawler(**browser_opts) as crawler:
        result = await crawler.arun(
            url='https://self-signed.badssl.com/',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_ignore_https_errors] success:", result.success)
|
||||
|
||||
async def main():
    """Manual test driver: enable the scenarios you want to run.

    Disabled scenarios (uncomment to run):
        test_default_headless
        test_managed_browser_persistent
        test_session_reuse
        test_magic_mode
        test_proxy_settings
    """
    print("Running tests...")
    # await test_default_headless()
    # await test_managed_browser_persistent()
    # await test_session_reuse()
    # await test_magic_mode()
    # await test_proxy_settings()
    await test_ignore_https_errors()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user