Merge pull request #9 from aravindkarnam/main

Pulling version 0.4.22 from main into scraper
2024-12-17 18:43:36 +05:30
parent 2f5e0598bb ed7bc1909c
commit 7c0fa269a6
72 changed files with 10610 additions and 5540 deletions
--- a/tests/async/test_0.4.2_browser_manager.py
+++ b/tests/async/test_0.4.2_browser_manager.py
@@ -0,0 +1,153 @@
+import os, sys
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(    os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+import os, sys
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+# These tests assume the crawler accepts distinct configurations
+# for the managed browser, persistent context, and so forth.
+
+async def test_default_headless():
+    async with AsyncWebCrawler(
+        headless=True,
+        verbose=True,
+        user_agent_mode="random",
+        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
+        use_managed_browser=False,
+        use_persistent_context=False,
+        ignore_https_errors=True,
+        # Testing normal ephemeral context
+    ) as crawler:
+        result = await crawler.arun(
+            url='https://www.kidocode.com/degrees/technology',
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
+        )
+        print("[test_default_headless] success:", result.success)
+        print("HTML length:", len(result.html if result.html else ""))
+        
+async def test_managed_browser_persistent():
+    # Treat use_persistent_context=True as a managed-browser scenario.
+    async with AsyncWebCrawler(
+        headless=False,
+        verbose=True,
+        user_agent_mode="random",
+        user_agent_generator_config={"device_type": "desktop", "os_type": "mac"},
+        use_managed_browser=True,
+        use_persistent_context=True,  # now should behave same as managed browser
+        user_data_dir="./outpu/test_profile",
+        # This should store and reuse profile data across runs
+    ) as crawler:
+        result = await crawler.arun(
+            url='https://www.google.com',
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+        )
+        print("[test_managed_browser_persistent] success:", result.success)
+        print("HTML length:", len(result.html if result.html else ""))
+
+async def test_session_reuse():
+    # Create a session and reuse it across multiple calls
+    session_id = "my_session"
+    async with AsyncWebCrawler(
+        headless=False,
+        verbose=True,
+        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
+        # Fixed user-agent for consistency
+        use_managed_browser=False,
+        use_persistent_context=False,
+    ) as crawler:
+        
+        # First call: create session
+        result1 = await crawler.arun(
+            url='https://www.example.com',
+            cache_mode=CacheMode.BYPASS,
+            session_id=session_id,
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+        )
+        print("[test_session_reuse first call] success:", result1.success)
+        
+        # Second call: same session, so cookies may be retained
+        result2 = await crawler.arun(
+            url='https://www.example.com/about',
+            cache_mode=CacheMode.BYPASS,
+            session_id=session_id,
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+        )
+        print("[test_session_reuse second call] success:", result2.success)
+
+async def test_magic_mode():
+    # Test magic mode with override_navigator and simulate_user
+    async with AsyncWebCrawler(
+        headless=False,
+        verbose=True,
+        user_agent_mode="random",
+        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
+        use_managed_browser=False,
+        use_persistent_context=False,
+        magic=True,
+        override_navigator=True,
+        simulate_user=True,
+    ) as crawler:
+        result = await crawler.arun(
+            url='https://www.kidocode.com/degrees/business',
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+        )
+        print("[test_magic_mode] success:", result.success)
+        print("HTML length:", len(result.html if result.html else ""))
+
+async def test_proxy_settings():
+    # Test with a proxy (if available) to ensure code runs with proxy
+    async with AsyncWebCrawler(
+        headless=True,
+        verbose=False,
+        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
+        proxy="http://127.0.0.1:8080",  # Assuming local proxy server for test
+        use_managed_browser=False,
+        use_persistent_context=False,
+    ) as crawler:
+        result = await crawler.arun(
+            url='https://httpbin.org/ip',
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+        )
+        print("[test_proxy_settings] success:", result.success)
+        if result.success:
+            print("HTML preview:", result.html[:200] if result.html else "")
+
+async def test_ignore_https_errors():
+    # Test ignoring HTTPS errors against a domain with a self-signed or invalid cert.
+    # This is only conceptual: the domain must be one that triggers an SSL error.
+    # The URL below is expected to fail normal certificate validation:
+    async with AsyncWebCrawler(
+        headless=True,
+        verbose=True,
+        user_agent="Mozilla/5.0",
+        ignore_https_errors=True,
+        use_managed_browser=False,
+        use_persistent_context=False,
+    ) as crawler:
+        result = await crawler.arun(
+            url='https://self-signed.badssl.com/',
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+        )
+        print("[test_ignore_https_errors] success:", result.success)
+
+async def main():
+    print("Running tests...")
+    # await test_default_headless()
+    # await test_managed_browser_persistent()
+    # await test_session_reuse()
+    # await test_magic_mode()
+    # await test_proxy_settings()
+    await test_ignore_https_errors()
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/tests/async/test_0.4.2_config_params.py
+++ b/tests/async/test_0.4.2_config_params.py
@@ -0,0 +1,231 @@
+import os, sys
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig      
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.chunking_strategy import RegexChunking
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+# Category 1: Browser Configuration Tests
+async def test_browser_config_object():
+    """Test the new BrowserConfig object with various browser settings"""
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=False,
+        viewport_width=1920,
+        viewport_height=1080,
+        use_managed_browser=True,
+        user_agent_mode="random",
+        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}
+    )
+    
+    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
+        result = await crawler.arun('https://example.com', cache_mode=CacheMode.BYPASS)
+        assert result.success, "Browser config crawl failed"
+        assert len(result.html) > 0, "No HTML content retrieved"
+
+async def test_browser_performance_config():
+    """Test browser configurations focused on performance"""
+    browser_config = BrowserConfig(
+        text_only=True,
+        light_mode=True,
+        extra_args=['--disable-gpu', '--disable-software-rasterizer'],
+        ignore_https_errors=True,
+        java_script_enabled=False
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun('https://example.com')
+        assert result.success, "Performance optimized crawl failed"
+        assert result.status_code == 200, "Unexpected status code"
+
+# Category 2: Content Processing Tests
+async def test_content_extraction_config():
+    """Test content extraction with various strategies"""
+    crawler_config = CrawlerRunConfig(
+        word_count_threshold=300,
+        extraction_strategy=JsonCssExtractionStrategy(
+            schema={
+                "name": "article",
+                "baseSelector": "div",
+                "fields": [{
+                    "name": "title",
+                    "selector": "h1",
+                    "type": "text"
+                }]
+            }
+        ),
+        chunking_strategy=RegexChunking(),
+        content_filter=PruningContentFilter()
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            'https://example.com/article',
+            config=crawler_config
+        )
+        assert result.extracted_content is not None, "Content extraction failed"
+        assert 'title' in result.extracted_content, "Missing expected content field"
+
+# Category 3: Cache and Session Management Tests
+async def test_cache_and_session_management():
+    """Test different cache modes and session handling"""
+    browser_config = BrowserConfig(use_persistent_context=True)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.WRITE_ONLY,
+        process_iframes=True,
+        remove_overlay_elements=True
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # First request - should write to cache
+        result1 = await crawler.arun(
+            'https://example.com',
+            config=crawler_config
+        )
+        
+        # Second request - should use fresh fetch due to WRITE_ONLY mode
+        result2 = await crawler.arun(
+            'https://example.com',
+            config=crawler_config
+        )
+        
+        assert result1.success and result2.success, "Cache mode crawl failed"
+        assert result1.html == result2.html, "Inconsistent results between requests"
+
+# Category 4: Media Handling Tests
+async def test_media_handling_config():
+    """Test configurations related to media handling"""
+    # Make sure the downloads directory under the home directory (~/.crawl4ai/downloads) exists
+    os.makedirs(os.path.expanduser("~/.crawl4ai/downloads"), exist_ok=True)
+    browser_config = BrowserConfig(
+        viewport_width=1920,
+        viewport_height=1080,
+        accept_downloads=True,
+        downloads_path= os.path.expanduser("~/.crawl4ai/downloads")
+    )
+    crawler_config = CrawlerRunConfig(
+        screenshot=True,
+        pdf=True,
+        adjust_viewport_to_content=True,
+        wait_for_images=True,
+        screenshot_height_threshold=20000
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            'https://example.com',
+            config=crawler_config
+        )
+        assert result.screenshot is not None, "Screenshot capture failed"
+        assert result.pdf is not None, "PDF generation failed"
+
+# Category 5: Anti-Bot and Site Interaction Tests
+async def test_antibot_config():
+    """Test configurations for handling anti-bot measures"""
+    crawler_config = CrawlerRunConfig(
+        simulate_user=True,
+        override_navigator=True,
+        magic=True,
+        wait_for="js:()=>document.querySelector('body')",
+        delay_before_return_html=1.0,
+        log_console=True,
+        cache_mode=CacheMode.BYPASS
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            'https://example.com',
+            config=crawler_config
+        )
+        assert result.success, "Anti-bot measure handling failed"
+
+# Category 6: Parallel Processing Tests
+async def test_parallel_processing():
+    """Test parallel processing capabilities"""
+    crawler_config = CrawlerRunConfig(
+        mean_delay=0.5,
+        max_range=1.0,
+        semaphore_count=5
+    )
+    
+    urls = [
+        'https://example.com/1',
+        'https://example.com/2',
+        'https://example.com/3'
+    ]
+    
+    async with AsyncWebCrawler() as crawler:
+        results = await crawler.arun_many(
+            urls,
+            config=crawler_config
+        )
+        assert len(results) == len(urls), "Not all URLs were processed"
+        assert all(r.success for r in results), "Some parallel requests failed"
+
+# Category 7: Backwards Compatibility Tests
+async def test_legacy_parameter_support():
+    """Test that legacy parameters still work"""
+    async with AsyncWebCrawler(
+        headless=True,
+        browser_type="chromium",
+        viewport_width=1024,
+        viewport_height=768
+    ) as crawler:
+        result = await crawler.arun(
+            'https://example.com',
+            screenshot=True,
+            word_count_threshold=200,
+            bypass_cache=True,
+            css_selector=".main-content"
+        )
+        assert result.success, "Legacy parameter support failed"
+
+# Category 8: Mixed Configuration Tests
+async def test_mixed_config_usage():
+    """Test mixing new config objects with legacy parameters"""
+    browser_config = BrowserConfig(headless=True)
+    crawler_config = CrawlerRunConfig(screenshot=True)
+    
+    async with AsyncWebCrawler(
+        config=browser_config,
+        verbose=True  # legacy parameter
+    ) as crawler:
+        result = await crawler.arun(
+            'https://example.com',
+            config=crawler_config,
+            cache_mode=CacheMode.BYPASS,  # legacy parameter
+            css_selector="body"  # legacy parameter
+        )
+        assert result.success, "Mixed configuration usage failed"
+
+if __name__ == "__main__":
+    async def run_tests():
+        test_functions = [
+            test_browser_config_object,
+            # test_browser_performance_config,
+            # test_content_extraction_config,
+            # test_cache_and_session_management,
+            # test_media_handling_config,
+            # test_antibot_config,
+            # test_parallel_processing,
+            # test_legacy_parameter_support,
+            # test_mixed_config_usage
+        ]
+        
+        for test in test_functions:
+            print(f"\nRunning {test.__name__}...")
+            try:
+                await test()
+                print(f"✓ {test.__name__} passed")
+            except AssertionError as e:
+                print(f"✗ {test.__name__} failed: {str(e)}")
+            except Exception as e:
+                print(f"✗ {test.__name__} error: {str(e)}")
+    
+    asyncio.run(run_tests())
--- a/tests/async/test_content_filter_bm25.py
+++ b/tests/async/test_content_filter_bm25.py
--- a/tests/async/test_content_filter_prune.py
+++ b/tests/async/test_content_filter_prune.py
@@ -0,0 +1,159 @@
+import os, sys
+import pytest
+from bs4 import BeautifulSoup
+
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai.content_filter_strategy import PruningContentFilter
+
+@pytest.fixture
+def basic_html():
+    return """
+    <html>
+        <body>
+            <article>
+                <h1>Main Article</h1>
+                <p>This is a high-quality paragraph with substantial text content. It contains enough words to pass the threshold and has good text density without too many links. This kind of content should survive the pruning process.</p>
+                <div class="sidebar">Low quality sidebar content</div>
+                <div class="social-share">Share buttons</div>
+            </article>
+        </body>
+    </html>
+    """
+
+@pytest.fixture
+def link_heavy_html():
+    return """
+    <html>
+        <body>
+            <div class="content">
+                <p>Good content paragraph that should remain.</p>
+                <div class="links">
+                    <a href="#">Link 1</a>
+                    <a href="#">Link 2</a>
+                    <a href="#">Link 3</a>
+                    <a href="#">Link 4</a>
+                </div>
+            </div>
+        </body>
+    </html>
+    """
+
+@pytest.fixture
+def mixed_content_html():
+    return """
+    <html>
+        <body>
+            <article>
+                <h1>Article Title</h1>
+                <p class="summary">Short summary.</p>
+                <div class="content">
+                    <p>Long high-quality paragraph with substantial content that should definitely survive the pruning process. This content has good text density and proper formatting which makes it valuable for retention.</p>
+                </div>
+                <div class="comments">
+                    <p>Short comment 1</p>
+                    <p>Short comment 2</p>
+                </div>
+            </article>
+        </body>
+    </html>
+    """
+
+class TestPruningContentFilter:
+    def test_basic_pruning(self, basic_html):
+        """Test basic content pruning functionality"""
+        filter = PruningContentFilter(min_word_threshold=5)
+        contents = filter.filter_content(basic_html)
+        
+        combined_content = ' '.join(contents).lower()
+        assert "high-quality paragraph" in combined_content
+        assert "sidebar content" not in combined_content
+        assert "share buttons" not in combined_content
+
+    def test_min_word_threshold(self, mixed_content_html):
+        """Test minimum word threshold filtering"""
+        filter = PruningContentFilter(min_word_threshold=10)
+        contents = filter.filter_content(mixed_content_html)
+        
+        combined_content = ' '.join(contents).lower()
+        assert "short summary" not in combined_content
+        assert "long high-quality paragraph" in combined_content
+        assert "short comment" not in combined_content
+
+    def test_threshold_types(self, basic_html):
+        """Test fixed vs dynamic thresholds"""
+        fixed_filter = PruningContentFilter(threshold_type='fixed', threshold=0.48)
+        dynamic_filter = PruningContentFilter(threshold_type='dynamic', threshold=0.45)
+        
+        fixed_contents = fixed_filter.filter_content(basic_html)
+        dynamic_contents = dynamic_filter.filter_content(basic_html)
+        
+        assert len(fixed_contents) != len(dynamic_contents), \
+            "Fixed and dynamic thresholds should yield different results"
+
+    def test_link_density_impact(self, link_heavy_html):
+        """Test handling of link-heavy content"""
+        filter = PruningContentFilter(threshold_type='dynamic')
+        contents = filter.filter_content(link_heavy_html)
+        
+        combined_content = ' '.join(contents).lower()
+        assert "good content paragraph" in combined_content
+        assert len([c for c in contents if 'href' in c]) < 2, \
+            "Should prune link-heavy sections"
+
+    def test_tag_importance(self, mixed_content_html):
+        """Test tag importance in scoring"""
+        filter = PruningContentFilter(threshold_type='dynamic')
+        contents = filter.filter_content(mixed_content_html)
+        
+        has_article = any('article' in c.lower() for c in contents)
+        has_h1 = any('h1' in c.lower() for c in contents)
+        assert has_article or has_h1, "Should retain important tags"
+
+    def test_empty_input(self):
+        """Test handling of empty input"""
+        filter = PruningContentFilter()
+        assert filter.filter_content("") == []
+        assert filter.filter_content(None) == []
+
+    def test_malformed_html(self):
+        """Test handling of malformed HTML"""
+        malformed_html = "<div>Unclosed div<p>Nested<span>content</div>"
+        filter = PruningContentFilter()
+        contents = filter.filter_content(malformed_html)
+        assert isinstance(contents, list)
+
+    def test_performance(self, basic_html):
+        """Test performance with timer"""
+        filter = PruningContentFilter()
+        
+        import time
+        start = time.perf_counter()
+        filter.filter_content(basic_html)
+        duration = time.perf_counter() - start
+        
+        # Strict bound: filtering this small document must finish in under 100 ms
+        assert duration < 0.1, f"Processing took too long: {duration:.3f} seconds"
+
+    @pytest.mark.parametrize("threshold,expected_count", [
+        (0.3, 4),  # Very lenient
+        (0.48, 2), # Default
+        (0.7, 1),  # Very strict
+    ])
+    def test_threshold_levels(self, mixed_content_html, threshold, expected_count):
+        """Test different threshold levels"""
+        filter = PruningContentFilter(threshold_type='fixed', threshold=threshold)
+        contents = filter.filter_content(mixed_content_html)
+        assert len(contents) <= expected_count, \
+            f"Expected {expected_count} or fewer elements with threshold {threshold}"
+
+    def test_consistent_output(self, basic_html):
+        """Test output consistency across multiple runs"""
+        filter = PruningContentFilter()
+        first_run = filter.filter_content(basic_html)
+        second_run = filter.filter_content(basic_html)
+        assert first_run == second_run, "Output should be consistent"
+
+if __name__ == "__main__":
+    pytest.main([__file__])
--- a/tests/async/test_markdown_genertor.py
+++ b/tests/async/test_markdown_genertor.py
@@ -11,7 +11,7 @@ import asyncio
 import os
 import time
 from typing import Dict, Any
-from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

 # Get current directory
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
@@ -41,7 +41,7 @@ def test_basic_markdown_conversion():
    with open(__location__ + "/data/wikipedia.html", "r") as f:
        cleaned_html = f.read()

-    generator = DefaultMarkdownGenerationStrategy()
+    generator = DefaultMarkdownGenerator()
    
    start_time = time.perf_counter()
    result = generator.generate_markdown(
@@ -70,7 +70,7 @@ def test_relative_links():
    Also an [image](/images/test.png) and another [page](/wiki/Banana).
    """
    
-    generator = DefaultMarkdownGenerationStrategy()
+    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://en.wikipedia.org"
@@ -86,7 +86,7 @@ def test_duplicate_links():
    Here's a [link](/test) and another [link](/test) and a [different link](/other).
    """
    
-    generator = DefaultMarkdownGenerationStrategy()
+    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://example.com"
@@ -102,7 +102,7 @@ def test_link_descriptions():
    Here's a [link with title](/test "Test Title") and a [link with description](/other) to test.
    """
    
-    generator = DefaultMarkdownGenerationStrategy()
+    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://example.com"
@@ -120,7 +120,7 @@ def test_performance_large_document():
    iterations = 5
    times = []
    
-    generator = DefaultMarkdownGenerationStrategy()
+    generator = DefaultMarkdownGenerator()
    
    for i in range(iterations):
        start_time = time.perf_counter()
@@ -144,7 +144,7 @@ def test_image_links():
    And a regular [link](/page).
    """
    
-    generator = DefaultMarkdownGenerationStrategy()
+    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://example.com"