Release/v0.7.8 (#1662)

* Fix: Use correct URL variable for raw HTML extraction (#1116) - Prevents full HTML content from being passed as URL to extraction strategies - Added unit tests to verify raw HTML and regular URL processing Fix: Wrong URL variable used for extraction of raw html * Fix #1181: Preserve whitespace in code blocks during HTML scraping The remove_empty_elements_fast() method was removing whitespace-only span elements inside <pre> and <code> tags, causing import statements like "import torch" to become "importtorch". Now skips elements inside code blocks where whitespace is significant. * Refactor Pydantic model configuration to use ConfigDict for arbitrary types * Fix EmbeddingStrategy: Uncomment response handling for the variations and clean up mock data. ref #1621 * Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638 * fix: ensure BrowserConfig.to_dict serializes proxy_config * feat: make LLM backoff configurable end-to-end - extend LLMConfig with backoff delay/attempt/factor fields and thread them through LLMExtractionStrategy, LLMContentFilter, table extraction, and Docker API handlers - expose the backoff parameter knobs on perform_completion_with_backoff/aperform_completion_with_backoff and document them in the md_v2 guides * reproduced AttributeError from #1642 * pass timeout parameter to docker client request * added missing deep crawling objects to init * generalized query in ContentRelevanceFilter to be a str or list * import modules from enhanceable deserialization * parameterized tests * Fix: capture current page URL to reflect JavaScript navigation and add test for delayed redirects. ref #1268 * refactor: replace PyPDF2 with pypdf across the codebase. ref #1412 * announcement: add application form for cloud API closed beta * Release v0.7.8: Stability & Bug Fix Release - Updated version to 0.7.8 - Introduced focused stability release addressing 11 community-reported bugs. 
- Key fixes include Docker API improvements, LLM extraction enhancements, URL handling corrections, and dependency updates. - Added detailed release notes for v0.7.8 in the blog and created a dedicated verification script to ensure all fixes are functioning as intended. - Updated documentation to reflect recent changes and improvements. * docs: add section for Crawl4AI Cloud API closed beta with application link * fix: add disk cleanup step to Docker workflow --------- Co-authored-by: rbushria <rbushri@gmail.com> Co-authored-by: AHMET YILMAZ <tawfik@kidocode.com> Co-authored-by: Soham Kukreti <kukretisoham@gmail.com> Co-authored-by: Chris Murphy <chris.murphy@klaviyo.com> Co-authored-by: Aravind Karnam <aravind.karanam@gmail.com>
2025-12-11 18:04:52 +08:00
parent 835e3c56fe
commit a87e8c1c9e
32 changed files with 2123 additions and 135 deletions
--- a/tests/async/test_redirect_url_resolution.py
+++ b/tests/async/test_redirect_url_resolution.py
@@ -0,0 +1,118 @@
+"""Test delayed redirect WITH wait_for - does link resolution use correct URL?"""
+import asyncio
+import threading
+from http.server import HTTPServer, SimpleHTTPRequestHandler
+
+class RedirectTestHandler(SimpleHTTPRequestHandler):
+    """HTTP fixture serving a page that redirects via delayed JavaScript.
+
+    Routes:
+      * ``/page-a``: HTML whose script sets ``window.location.href`` to
+        ``/redirect-target/`` after a 200 ms timeout.
+      * any path starting with ``/redirect-target``: HTML containing
+        ``<nav id="target-nav">`` with two relative links.
+      * anything else: an empty 404 response.
+    """
+
+    def log_message(self, format, *args):
+        """Discard per-request log lines so they do not clutter the output."""
+        pass
+
+    def do_GET(self):
+        """Answer a GET request according to the routes listed on the class."""
+        if self.path == "/page-a":
+            self.send_response(200)
+            self.send_header("Content-type", "text/html")
+            self.end_headers()
+            # The redirect happens client-side after the page has loaded, so
+            # the HTTP response itself is a plain 200.
+            content = """
+            <!DOCTYPE html>
+            <html>
+            <head><title>Page A</title></head>
+            <body>
+                <h1>Page A - Will redirect after 200ms</h1>
+                <script>
+                    setTimeout(function() {
+                        window.location.href = '/redirect-target/';
+                    }, 200);
+                </script>
+            </body>
+            </html>
+            """
+            self.wfile.write(content.encode())
+        elif self.path.startswith("/redirect-target"):
+            self.send_response(200)
+            self.send_header("Content-type", "text/html")
+            self.end_headers()
+            # The hrefs below are relative, so their absolute form depends on
+            # the base URL used to resolve them.
+            content = """
+            <!DOCTYPE html>
+            <html>
+            <head><title>Redirect Target</title></head>
+            <body>
+                <h1>Redirect Target</h1>
+                <nav id="target-nav">
+                    <a href="subpage-1">Subpage 1</a>
+                    <a href="subpage-2">Subpage 2</a>
+                </nav>
+            </body>
+            </html>
+            """
+            self.wfile.write(content.encode())
+        else:
+            self.send_response(404)
+            self.end_headers()
+
+async def main():
+    import socket
+    class ReuseAddrHTTPServer(HTTPServer):
+        allow_reuse_address = True
+    
+    server = ReuseAddrHTTPServer(("localhost", 8769), RedirectTestHandler)
+    thread = threading.Thread(target=server.serve_forever)
+    thread.daemon = True
+    thread.start()
+    
+    try:
+        import sys
+        sys.path.insert(0, '/Users/nasrin/vscode/c4ai-uc/develop')
+        from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+        
+        print("=" * 60)
+        print("TEST: Delayed JS redirect WITH wait_for='css:#target-nav'")
+        print("This waits for the redirect to complete")
+        print("=" * 60)
+        
+        browser_config = BrowserConfig(headless=True, verbose=False)
+        crawl_config = CrawlerRunConfig(
+            cache_mode="bypass",
+            wait_for="css:#target-nav"  # Wait for element on redirect target
+        )
+        
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result = await crawler.arun(
+                url="http://localhost:8769/page-a",
+                config=crawl_config
+            )
+            
+            print(f"Original URL: http://localhost:8769/page-a")
+            print(f"Redirected URL returned: {result.redirected_url}")
+            print(f"HTML contains 'Redirect Target': {'Redirect Target' in result.html}")
+            print()
+            
+            if "/redirect-target" in (result.redirected_url or ""):
+                print("✓ redirected_url is CORRECT")
+            else:
+                print("✗ BUG #1: redirected_url is WRONG - still shows original URL!")
+                
+            # Check links
+            all_links = []
+            if isinstance(result.links, dict):
+                all_links = result.links.get("internal", []) + result.links.get("external", [])
+            
+            print(f"\nLinks found ({len(all_links)} total):")
+            bug_found = False
+            for link in all_links:
+                href = link.get("href", "") if isinstance(link, dict) else getattr(link, 'href', "")
+                if "subpage" in href:
+                    print(f"  {href}")
+                    if "/page-a/" in href:
+                        print("    ^^^ BUG #2: Link resolved with WRONG base URL!")
+                        bug_found = True
+                    elif "/redirect-target/" in href:
+                        print("    ^^^ CORRECT")
+            
+            if not bug_found and all_links:
+                print("\n✓ Link resolution is CORRECT")
+                        
+    finally:
+        server.shutdown()
+
+# Run the check directly: python tests/async/test_redirect_url_resolution.py
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/tests/check_dependencies.py
+++ b/tests/check_dependencies.py
@@ -71,7 +71,7 @@ PACKAGE_MAPPINGS = {
    'sentence_transformers': 'sentence-transformers',
    'rank_bm25': 'rank-bm25',
    'snowballstemmer': 'snowballstemmer',
-    'PyPDF2': 'PyPDF2',
+    'pypdf': 'pypdf',
    'pdf2image': 'pdf2image',
 }

--- a/tests/docker/test_filter_deep_crawl.py
+++ b/tests/docker/test_filter_deep_crawl.py
@@ -1,16 +1,31 @@
 """
 Test the complete fix for both the filter serialization and JSON serialization issues.
 """
+import os
+import traceback
+from typing import Any

 import asyncio
 import httpx

 from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
-from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
+from crawl4ai.deep_crawling import (
+    BFSDeepCrawlStrategy,
+    ContentRelevanceFilter,
+    FilterChain,
+    URLFilter,
+    URLPatternFilter,
+)

-BASE_URL = "http://localhost:11234/"  # Adjust port as needed
+CRAWL4AI_DOCKER_PORT = os.environ.get("CRAWL4AI_DOCKER_PORT", "11234")
+try:
+    BASE_PORT = int(CRAWL4AI_DOCKER_PORT)
+except TypeError:
+    BASE_PORT = 11234
+BASE_URL = f"http://localhost:{BASE_PORT}/"  # Adjust port as needed

-async def test_with_docker_client():
+
+async def test_with_docker_client(filter_chain: list[URLFilter], max_pages: int = 20, timeout: int = 30) -> bool:
    """Test using the Docker client (same as 1419.py)."""
    from crawl4ai.docker_client import Crawl4aiDockerClient
    
@@ -24,19 +39,10 @@ async def test_with_docker_client():
            verbose=True,
        ) as client:
            
-            # Create filter chain - testing the serialization fix
-            filter_chain = [
-                URLPatternFilter(
-                    # patterns=["*about*", "*privacy*", "*terms*"],
-                    patterns=["*advanced*"],
-                    reverse=True
-                ),
-            ]
-            
            crawler_config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=2,  # Keep it shallow for testing
-                    # max_pages=5,  # Limit pages for testing
+                    max_pages=max_pages,  # Limit pages for testing
                    filter_chain=FilterChain(filter_chain)
                ),
                cache_mode=CacheMode.BYPASS,
@@ -47,6 +53,7 @@ async def test_with_docker_client():
                ["https://docs.crawl4ai.com"],  # Simple test page
                browser_config=BrowserConfig(headless=True),
                crawler_config=crawler_config,
+                hooks_timeout=timeout,
            )
            
            if results:
@@ -74,12 +81,11 @@ async def test_with_docker_client():
        
    except Exception as e:
        print(f"❌ Docker client test failed: {e}")
-        import traceback
        traceback.print_exc()
        return False


-async def test_with_rest_api():
+async def test_with_rest_api(filters: list[dict[str, Any]], max_pages: int = 20, timeout: int = 30) -> bool:
    """Test using REST API directly."""
    print("\n" + "=" * 60)
    print("Testing with REST API")
@@ -90,19 +96,11 @@ async def test_with_rest_api():
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": 2,
-            # "max_pages": 5,
+            "max_pages": max_pages,
            "filter_chain": {
                "type": "FilterChain",
                "params": {
-                    "filters": [
-                        {
-                            "type": "URLPatternFilter",
-                            "params": {
-                                "patterns": ["*advanced*"],
-                                "reverse": True
-                            }
-                        }
-                    ]
+                    "filters": filters
                }
            }
        }
@@ -126,7 +124,7 @@ async def test_with_rest_api():
            response = await client.post(
                f"{BASE_URL}crawl",
                json=crawl_payload,
-                timeout=30
+                timeout=timeout,
            )
            
            if response.status_code == 200:
@@ -150,7 +148,6 @@ async def test_with_rest_api():
        
    except Exception as e:
        print(f"❌ REST API test failed: {e}")
-        import traceback
        traceback.print_exc()
        return False

@@ -165,12 +162,62 @@ async def main():
    results = []
    
    # Test 1: Docker client
-    docker_passed = await test_with_docker_client()
-    results.append(("Docker Client", docker_passed))
+    max_pages_ = [20, 5]
+    timeouts = [30, 60]
+    filter_chain_test_cases = [
+        [
+            URLPatternFilter(
+                # patterns=["*about*", "*privacy*", "*terms*"],
+                patterns=["*advanced*"],
+                reverse=True
+            ),
+        ],
+        [
+            ContentRelevanceFilter(
+                query="about faq",
+                threshold=0.2,
+            ),
+        ],
+    ]
+    for idx, (filter_chain, max_pages, timeout) in enumerate(zip(filter_chain_test_cases, max_pages_, timeouts)):
+        docker_passed = await test_with_docker_client(filter_chain=filter_chain, max_pages=max_pages, timeout=timeout)
+        results.append((f"Docker Client w/ filter chain {idx}", docker_passed))
    
    # Test 2: REST API
-    rest_passed = await test_with_rest_api()
-    results.append(("REST API", rest_passed))
+    max_pages_ = [20, 5, 5]
+    timeouts = [30, 60, 60]
+    filters_test_cases = [
+        [
+            {
+                "type": "URLPatternFilter",
+                "params": {
+                    "patterns": ["*advanced*"],
+                    "reverse": True
+                }
+            }
+        ],
+        [
+            {
+                "type": "ContentRelevanceFilter",
+                "params": {
+                    "query": "about faq",
+                    "threshold": 0.2,
+                }
+            }
+        ],
+        [
+            {
+                "type": "ContentRelevanceFilter",
+                "params": {
+                    "query": ["about", "faq"],
+                    "threshold": 0.2,
+                }
+            }
+        ],
+    ]
+    for idx, (filters, max_pages, timeout) in enumerate(zip(filters_test_cases, max_pages_, timeouts)):
+        rest_passed = await test_with_rest_api(filters=filters, max_pages=max_pages, timeout=timeout)
+        results.append((f"REST API w/ filters {idx}", rest_passed))
    
    # Summary
    print("\n" + "=" * 60)
@@ -186,10 +233,7 @@ async def main():
    
    print("=" * 60)
    if all_passed:
-        print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
-        print("\nThe fixes:")
-        print("1. Filter serialization: Fixed by not serializing private __slots__")
-        print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
+        print("🎉 ALL TESTS PASSED!")
    else:
        print("⚠️ Some tests failed. Please check the server logs for details.")
    
@@ -198,4 +242,4 @@ async def main():

 if __name__ == "__main__":
    import sys
-    sys.exit(asyncio.run(main()))
+    sys.exit(asyncio.run(main()))
--- a/tests/general/test_async_webcrawler.py
+++ b/tests/general/test_async_webcrawler.py
@@ -9,6 +9,21 @@ from crawl4ai import (
    RateLimiter,
    CacheMode
 )
+from crawl4ai.extraction_strategy import ExtractionStrategy
+
+class MockExtractionStrategy(ExtractionStrategy):
+    """Mock extraction strategy for testing URL parameter handling"""
+
+    def __init__(self):
+        """Initialise the base strategy and start with an empty call log."""
+        super().__init__()
+        # One entry per run() call: the ``url`` argument it received.
+        self.run_calls = []
+
+    def extract(self, url: str, html: str, *args, **kwargs):
+        """Return a fixed single-item payload regardless of the input."""
+        return [{"test": "data"}]
+
+    # NOTE(review): ``List`` must be imported from ``typing`` in this module's
+    # header (not visible in this hunk) — confirm.
+    def run(self, url: str, sections: List[str], *args, **kwargs):
+        """Record the ``url`` received, then delegate to the base ``run``."""
+        self.run_calls.append(url)
+        return super().run(url, sections, *args, **kwargs)

@pytest.mark.asyncio
@pytest.mark.parametrize("viewport", [
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
        assert not result.success
        assert result.error_message is not None

+@pytest.mark.asyncio
+async def test_extraction_strategy_run_with_regular_url():
+    """
+    Regression test for extraction_strategy.run URL parameter handling with regular URLs.
+
+    This test verifies that when is_raw_html=False (regular URL),
+    extraction_strategy.run is called with the actual URL.
+    """
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # The mock logs every url handed to run(); the asserts below read it.
+        mock_strategy = MockExtractionStrategy()
+
+        # Test regular URL (is_raw_html=False)
+        regular_url = "https://example.com"
+        result = await crawler.arun(
+            url=regular_url,
+            config=CrawlerRunConfig(
+                page_timeout=30000,
+                extraction_strategy=mock_strategy,
+                cache_mode=CacheMode.BYPASS
+            )
+        )
+
+        assert result.success
+        # run() must be invoked exactly once, with the URL given to arun().
+        assert len(mock_strategy.run_calls) == 1
+        assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
+
+@pytest.mark.asyncio
+async def test_extraction_strategy_run_with_raw_html():
+    """
+    Regression test for extraction_strategy.run URL parameter handling with raw HTML.
+
+    This test verifies that when is_raw_html=True (URL starts with "raw:"),
+    extraction_strategy.run is called with "Raw HTML" instead of the actual URL.
+    """
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # The mock logs every url handed to run(); the asserts below read it.
+        mock_strategy = MockExtractionStrategy()
+
+        # Test raw HTML URL (is_raw_html=True automatically set)
+        raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
+        result = await crawler.arun(
+            url=raw_html_url,
+            config=CrawlerRunConfig(
+                page_timeout=30000,
+                extraction_strategy=mock_strategy,
+                cache_mode=CacheMode.BYPASS
+            )
+        )
+
+        assert result.success
+        # run() must be invoked exactly once, and must receive the "Raw HTML"
+        # placeholder rather than the full "raw:..." markup.
+        assert len(mock_strategy.run_calls) == 1
+        assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
+
 if __name__ == "__main__":
    asyncio.run(test_viewport_config((1024, 768)))
    asyncio.run(test_memory_management())
    asyncio.run(test_rate_limiting())
-    asyncio.run(test_javascript_execution())
+    asyncio.run(test_javascript_execution())
+    asyncio.run(test_extraction_strategy_run_with_regular_url())
+    asyncio.run(test_extraction_strategy_run_with_raw_html())