Release/v0.7.8 (#1662)

* Fix: Use correct URL variable for raw HTML extraction (#1116)

- Prevents full HTML content from being passed as URL to extraction strategies
- Added unit tests to verify raw HTML and regular URL processing

Fix: Wrong URL variable used for extraction of raw html

* Fix #1181: Preserve whitespace in code blocks during HTML scraping

  The remove_empty_elements_fast() method was removing whitespace-only
  span elements inside <pre> and <code> tags, causing import statements
  like "import torch" to become "importtorch". Now skips elements inside
  code blocks where whitespace is significant.

* Refactor Pydantic model configuration to use ConfigDict for arbitrary types

* Fix EmbeddingStrategy: Uncomment response handling for the variations and clean up mock data. ref #1621

* Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638

* fix: ensure BrowserConfig.to_dict serializes proxy_config

* feat: make LLM backoff configurable end-to-end

- extend LLMConfig with backoff delay/attempt/factor fields and thread them
  through LLMExtractionStrategy, LLMContentFilter, table extraction, and
  Docker API handlers
- expose the backoff parameter knobs on perform_completion_with_backoff/aperform_completion_with_backoff
  and document them in the md_v2 guides

* reproduced AttributeError from #1642

* pass timeout parameter to docker client request

* added missing deep crawling objects to init

* generalized query in ContentRelevanceFilter to be a str or list

* import modules from enhanceable deserialization

* parameterized tests

* Fix: capture current page URL to reflect JavaScript navigation and add test for delayed redirects. ref #1268

* refactor: replace PyPDF2 with pypdf across the codebase. ref #1412

* announcement: add application form for cloud API closed beta

* Release v0.7.8: Stability & Bug Fix Release

- Updated version to 0.7.8
- Introduced focused stability release addressing 11 community-reported bugs.
- Key fixes include Docker API improvements, LLM extraction enhancements, URL handling corrections, and dependency updates.
- Added detailed release notes for v0.7.8 in the blog and created a dedicated verification script to ensure all fixes are functioning as intended.
- Updated documentation to reflect recent changes and improvements.

* docs: add section for Crawl4AI Cloud API closed beta with application link

* fix: add disk cleanup step to Docker workflow

---------

Co-authored-by: rbushria <rbushri@gmail.com>
Co-authored-by: AHMET YILMAZ <tawfik@kidocode.com>
Co-authored-by: Soham Kukreti <kukretisoham@gmail.com>
Co-authored-by: Chris Murphy <chris.murphy@klaviyo.com>
Co-authored-by: Aravind Karnam <aravind.karanam@gmail.com>
This commit is contained in:
Nasrin
2025-12-11 18:04:52 +08:00
committed by GitHub
parent 835e3c56fe
commit a87e8c1c9e
32 changed files with 2123 additions and 135 deletions

View File

@@ -0,0 +1,118 @@
"""Test delayed redirect WITH wait_for - does link resolution use correct URL?"""
import asyncio
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
class RedirectTestHandler(SimpleHTTPRequestHandler):
def log_message(self, format, *args):
pass
def do_GET(self):
if self.path == "/page-a":
self.send_response(200)
self.send_header("Content-type", "text/html")
self.end_headers()
content = """
<!DOCTYPE html>
<html>
<head><title>Page A</title></head>
<body>
<h1>Page A - Will redirect after 200ms</h1>
<script>
setTimeout(function() {
window.location.href = '/redirect-target/';
}, 200);
</script>
</body>
</html>
"""
self.wfile.write(content.encode())
elif self.path.startswith("/redirect-target"):
self.send_response(200)
self.send_header("Content-type", "text/html")
self.end_headers()
content = """
<!DOCTYPE html>
<html>
<head><title>Redirect Target</title></head>
<body>
<h1>Redirect Target</h1>
<nav id="target-nav">
<a href="subpage-1">Subpage 1</a>
<a href="subpage-2">Subpage 2</a>
</nav>
</body>
</html>
"""
self.wfile.write(content.encode())
else:
self.send_response(404)
self.end_headers()
async def main():
import socket
class ReuseAddrHTTPServer(HTTPServer):
allow_reuse_address = True
server = ReuseAddrHTTPServer(("localhost", 8769), RedirectTestHandler)
thread = threading.Thread(target=server.serve_forever)
thread.daemon = True
thread.start()
try:
import sys
sys.path.insert(0, '/Users/nasrin/vscode/c4ai-uc/develop')
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
print("=" * 60)
print("TEST: Delayed JS redirect WITH wait_for='css:#target-nav'")
print("This waits for the redirect to complete")
print("=" * 60)
browser_config = BrowserConfig(headless=True, verbose=False)
crawl_config = CrawlerRunConfig(
cache_mode="bypass",
wait_for="css:#target-nav" # Wait for element on redirect target
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="http://localhost:8769/page-a",
config=crawl_config
)
print(f"Original URL: http://localhost:8769/page-a")
print(f"Redirected URL returned: {result.redirected_url}")
print(f"HTML contains 'Redirect Target': {'Redirect Target' in result.html}")
print()
if "/redirect-target" in (result.redirected_url or ""):
print("✓ redirected_url is CORRECT")
else:
print("✗ BUG #1: redirected_url is WRONG - still shows original URL!")
# Check links
all_links = []
if isinstance(result.links, dict):
all_links = result.links.get("internal", []) + result.links.get("external", [])
print(f"\nLinks found ({len(all_links)} total):")
bug_found = False
for link in all_links:
href = link.get("href", "") if isinstance(link, dict) else getattr(link, 'href', "")
if "subpage" in href:
print(f" {href}")
if "/page-a/" in href:
print(" ^^^ BUG #2: Link resolved with WRONG base URL!")
bug_found = True
elif "/redirect-target/" in href:
print(" ^^^ CORRECT")
if not bug_found and all_links:
print("\n✓ Link resolution is CORRECT")
finally:
server.shutdown()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -71,7 +71,7 @@ PACKAGE_MAPPINGS = {
'sentence_transformers': 'sentence-transformers',
'rank_bm25': 'rank-bm25',
'snowballstemmer': 'snowballstemmer',
'PyPDF2': 'PyPDF2',
'pypdf': 'pypdf',
'pdf2image': 'pdf2image',
}

View File

@@ -1,16 +1,31 @@
"""
Test the complete fix for both the filter serialization and JSON serialization issues.
"""
import os
import traceback
from typing import Any
import asyncio
import httpx
from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
from crawl4ai.deep_crawling import (
BFSDeepCrawlStrategy,
ContentRelevanceFilter,
FilterChain,
URLFilter,
URLPatternFilter,
)
BASE_URL = "http://localhost:11234/" # Adjust port as needed
CRAWL4AI_DOCKER_PORT = os.environ.get("CRAWL4AI_DOCKER_PORT", "11234")
try:
BASE_PORT = int(CRAWL4AI_DOCKER_PORT)
except TypeError:
BASE_PORT = 11234
BASE_URL = f"http://localhost:{BASE_PORT}/" # Adjust port as needed
async def test_with_docker_client():
async def test_with_docker_client(filter_chain: list[URLFilter], max_pages: int = 20, timeout: int = 30) -> bool:
"""Test using the Docker client (same as 1419.py)."""
from crawl4ai.docker_client import Crawl4aiDockerClient
@@ -24,19 +39,10 @@ async def test_with_docker_client():
verbose=True,
) as client:
# Create filter chain - testing the serialization fix
filter_chain = [
URLPatternFilter(
# patterns=["*about*", "*privacy*", "*terms*"],
patterns=["*advanced*"],
reverse=True
),
]
crawler_config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=2, # Keep it shallow for testing
# max_pages=5, # Limit pages for testing
max_pages=max_pages, # Limit pages for testing
filter_chain=FilterChain(filter_chain)
),
cache_mode=CacheMode.BYPASS,
@@ -47,6 +53,7 @@ async def test_with_docker_client():
["https://docs.crawl4ai.com"], # Simple test page
browser_config=BrowserConfig(headless=True),
crawler_config=crawler_config,
hooks_timeout=timeout,
)
if results:
@@ -74,12 +81,11 @@ async def test_with_docker_client():
except Exception as e:
print(f"❌ Docker client test failed: {e}")
import traceback
traceback.print_exc()
return False
async def test_with_rest_api():
async def test_with_rest_api(filters: list[dict[str, Any]], max_pages: int = 20, timeout: int = 30) -> bool:
"""Test using REST API directly."""
print("\n" + "=" * 60)
print("Testing with REST API")
@@ -90,19 +96,11 @@ async def test_with_rest_api():
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 2,
# "max_pages": 5,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "URLPatternFilter",
"params": {
"patterns": ["*advanced*"],
"reverse": True
}
}
]
"filters": filters
}
}
}
@@ -126,7 +124,7 @@ async def test_with_rest_api():
response = await client.post(
f"{BASE_URL}crawl",
json=crawl_payload,
timeout=30
timeout=timeout,
)
if response.status_code == 200:
@@ -150,7 +148,6 @@ async def test_with_rest_api():
except Exception as e:
print(f"❌ REST API test failed: {e}")
import traceback
traceback.print_exc()
return False
@@ -165,12 +162,62 @@ async def main():
results = []
# Test 1: Docker client
docker_passed = await test_with_docker_client()
results.append(("Docker Client", docker_passed))
max_pages_ = [20, 5]
timeouts = [30, 60]
filter_chain_test_cases = [
[
URLPatternFilter(
# patterns=["*about*", "*privacy*", "*terms*"],
patterns=["*advanced*"],
reverse=True
),
],
[
ContentRelevanceFilter(
query="about faq",
threshold=0.2,
),
],
]
for idx, (filter_chain, max_pages, timeout) in enumerate(zip(filter_chain_test_cases, max_pages_, timeouts)):
docker_passed = await test_with_docker_client(filter_chain=filter_chain, max_pages=max_pages, timeout=timeout)
results.append((f"Docker Client w/ filter chain {idx}", docker_passed))
# Test 2: REST API
rest_passed = await test_with_rest_api()
results.append(("REST API", rest_passed))
max_pages_ = [20, 5, 5]
timeouts = [30, 60, 60]
filters_test_cases = [
[
{
"type": "URLPatternFilter",
"params": {
"patterns": ["*advanced*"],
"reverse": True
}
}
],
[
{
"type": "ContentRelevanceFilter",
"params": {
"query": "about faq",
"threshold": 0.2,
}
}
],
[
{
"type": "ContentRelevanceFilter",
"params": {
"query": ["about", "faq"],
"threshold": 0.2,
}
}
],
]
for idx, (filters, max_pages, timeout) in enumerate(zip(filters_test_cases, max_pages_, timeouts)):
rest_passed = await test_with_rest_api(filters=filters, max_pages=max_pages, timeout=timeout)
results.append((f"REST API w/ filters {idx}", rest_passed))
# Summary
print("\n" + "=" * 60)
@@ -186,10 +233,7 @@ async def main():
print("=" * 60)
if all_passed:
print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
print("\nThe fixes:")
print("1. Filter serialization: Fixed by not serializing private __slots__")
print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
print("🎉 ALL TESTS PASSED!")
else:
print("⚠️ Some tests failed. Please check the server logs for details.")
@@ -198,4 +242,4 @@ async def main():
if __name__ == "__main__":
import sys
sys.exit(asyncio.run(main()))
sys.exit(asyncio.run(main()))

View File

@@ -9,6 +9,21 @@ from crawl4ai import (
RateLimiter,
CacheMode
)
from crawl4ai.extraction_strategy import ExtractionStrategy
class MockExtractionStrategy(ExtractionStrategy):
"""Mock extraction strategy for testing URL parameter handling"""
def __init__(self):
super().__init__()
self.run_calls = []
def extract(self, url: str, html: str, *args, **kwargs):
return [{"test": "data"}]
def run(self, url: str, sections: List[str], *args, **kwargs):
self.run_calls.append(url)
return super().run(url, sections, *args, **kwargs)
@pytest.mark.asyncio
@pytest.mark.parametrize("viewport", [
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
assert not result.success
assert result.error_message is not None
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_regular_url():
"""
Regression test for extraction_strategy.run URL parameter handling with regular URLs.
This test verifies that when is_raw_html=False (regular URL),
extraction_strategy.run is called with the actual URL.
"""
browser_config = BrowserConfig(
browser_type="chromium",
headless=True
)
async with AsyncWebCrawler(config=browser_config) as crawler:
mock_strategy = MockExtractionStrategy()
# Test regular URL (is_raw_html=False)
regular_url = "https://example.com"
result = await crawler.arun(
url=regular_url,
config=CrawlerRunConfig(
page_timeout=30000,
extraction_strategy=mock_strategy,
cache_mode=CacheMode.BYPASS
)
)
assert result.success
assert len(mock_strategy.run_calls) == 1
assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_raw_html():
"""
Regression test for extraction_strategy.run URL parameter handling with raw HTML.
This test verifies that when is_raw_html=True (URL starts with "raw:"),
extraction_strategy.run is called with "Raw HTML" instead of the actual URL.
"""
browser_config = BrowserConfig(
browser_type="chromium",
headless=True
)
async with AsyncWebCrawler(config=browser_config) as crawler:
mock_strategy = MockExtractionStrategy()
# Test raw HTML URL (is_raw_html=True automatically set)
raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
result = await crawler.arun(
url=raw_html_url,
config=CrawlerRunConfig(
page_timeout=30000,
extraction_strategy=mock_strategy,
cache_mode=CacheMode.BYPASS
)
)
assert result.success
assert len(mock_strategy.run_calls) == 1
assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
if __name__ == "__main__":
asyncio.run(test_viewport_config((1024, 768)))
asyncio.run(test_memory_management())
asyncio.run(test_rate_limiting())
asyncio.run(test_javascript_execution())
asyncio.run(test_javascript_execution())
asyncio.run(test_extraction_strategy_run_with_regular_url())
asyncio.run(test_extraction_strategy_run_with_raw_html())