Release/v0.7.8 (#1662)

* Fix: Use correct URL variable for raw HTML extraction (#1116)

- Prevents the full HTML content from being passed as the URL to extraction strategies
- Added unit tests to verify raw HTML and regular URL processing

Fix: Wrong URL variable used for extraction of raw HTML
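
  A minimal sketch of the raw-HTML path this fixes, assuming the documented `raw://` URL prefix and a CSS-based extraction schema; after the fix the extraction strategy receives the short pseudo-URL rather than the full HTML string:

  ```python
  import asyncio
  from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
  from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

  # Illustrative schema: pull the <h1> text out of the raw HTML snippet.
  schema = {
      "name": "page",
      "baseSelector": "body",
      "fields": [{"name": "title", "selector": "h1", "type": "text"}],
  }

  async def main():
      raw_url = "raw://<html><body><h1>Hello</h1></body></html>"
      config = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema))
      async with AsyncWebCrawler() as crawler:
          # The strategy now sees the raw:// pseudo-URL, not the HTML body itself.
          result = await crawler.arun(url=raw_url, config=config)
          print(result.extracted_content)

  asyncio.run(main())
  ```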

* Fix #1181: Preserve whitespace in code blocks during HTML scraping

  The remove_empty_elements_fast() method was removing whitespace-only
  span elements inside <pre> and <code> tags, causing import statements
  like "import torch" to become "importtorch". Now skips elements inside
  code blocks where whitespace is significant.
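
  A standalone sketch of the ancestor check the fix adds, using lxml directly rather than the scraping strategy itself (the helper name is illustrative):

  ```python
  from lxml import html

  doc = html.fromstring(
      "<pre><code><span>import</span><span class='w'> </span><span>torch</span></code></pre>"
  )

  def in_code_block(el):
      # Walk up the tree; whitespace inside <pre>/<code> is significant and must be kept.
      ancestor = el.getparent()
      while ancestor is not None:
          if ancestor.tag in ("pre", "code"):
              return True
          ancestor = ancestor.getparent()
      return False

  spans = doc.xpath("//span")
  print(all(in_code_block(s) for s in spans))  # True: these spans are now skipped, not pruned
  print(doc.text_content())                    # "import torch", whitespace preserved
  ```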

* Refactor Pydantic model configuration to use ConfigDict for arbitrary types
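
  For reference, a sketch of the Pydantic v2 pattern this moves to:

  ```python
  from pydantic import BaseModel, ConfigDict

  class ExampleResult(BaseModel):
      # Old style (Pydantic v1):
      #   class Config:
      #       arbitrary_types_allowed = True
      # New style (Pydantic v2):
      model_config = ConfigDict(arbitrary_types_allowed=True)
  ```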

* Fix EmbeddingStrategy: uncomment the response handling for query variations and clean up the mock data. ref #1621

* Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638

* fix: ensure BrowserConfig.to_dict serializes proxy_config
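
  A minimal sketch, assuming `ProxyConfig` is importable from the package root and accepts a `server` argument; previously `to_dict()` left the `ProxyConfig` object in place, which broke JSON serialization:

  ```python
  from crawl4ai import BrowserConfig, ProxyConfig

  cfg = BrowserConfig(proxy_config=ProxyConfig(server="http://127.0.0.1:8080"))
  # proxy_config is now serialized to a plain dict instead of the ProxyConfig instance.
  print(cfg.to_dict()["proxy_config"])
  ```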

* feat: make LLM backoff configurable end-to-end

- extend LLMConfig with backoff delay/attempt/factor fields and thread them
  through LLMExtractionStrategy, LLMContentFilter, table extraction, and
  Docker API handlers
- expose the backoff parameters on perform_completion_with_backoff/aperform_completion_with_backoff
  and document them in the md_v2 guides
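
  A sketch of the new knobs end-to-end (parameter names taken from the diff below; the provider and token values are placeholders):

  ```python
  from crawl4ai import LLMConfig, LLMExtractionStrategy

  llm_config = LLMConfig(
      provider="openai/gpt-4o-mini",
      api_token="<YOUR_API_KEY>",
      backoff_base_delay=2,           # first retry waits 2 s
      backoff_max_attempts=3,         # stop after 3 attempts total
      backoff_exponential_factor=2,   # delay = base_delay * factor ** attempt
  )

  strategy = LLMExtractionStrategy(
      llm_config=llm_config,
      instruction="Extract the main article title and author.",
  )
  ```

  With these defaults the retries wait 2 s and then 4 s; the third failed attempt raises instead of sleeping again, matching the formula in perform_completion_with_backoff.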

* reproduced AttributeError from #1642

* pass timeout parameter to docker client request

* added missing deep crawling objects (ContentRelevanceFilter, ContentTypeScorer) to the package __init__

* generalized the query parameter in ContentRelevanceFilter to accept a str or a list of str
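
  With the __init__ export added in this release, both forms below should work (a sketch; the threshold value is arbitrary):

  ```python
  from crawl4ai import ContentRelevanceFilter              # now re-exported at the package root
  # from crawl4ai.deep_crawling import ContentRelevanceFilter  # module-level import also works

  # query may be a single string or a list of strings (list items are joined into one BM25 query)
  single = ContentRelevanceFilter(query="vegetable fried rice recipe", threshold=0.5)
  multi = ContentRelevanceFilter(query=["fried rice", "vegetable recipes"], threshold=0.5)
  ```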

* import modules dynamically to make deserialization extensible

* parameterized tests

* Fix: capture current page URL to reflect JavaScript navigation and add test for delayed redirects. ref #1268
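
  A sketch of checking the captured URL after a JS-driven navigation (the redirect target and delay are illustrative, and this assumes the result exposes `redirected_url`):

  ```python
  import asyncio
  from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

  async def main():
      config = CrawlerRunConfig(
          js_code="window.location.href = '/docs/';",  # triggers an in-page navigation
          delay_before_return_html=2.0,                # give the delayed redirect time to land
      )
      async with AsyncWebCrawler() as crawler:
          result = await crawler.arun(url="https://example.com", config=config)
          # Reflects page.url at response time, i.e. the post-navigation URL.
          print(result.redirected_url)

  asyncio.run(main())
  ```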

* refactor: replace PyPDF2 with pypdf across the codebase. ref #1412

* announcement: add application form for cloud API closed beta

* Release v0.7.8: Stability & Bug Fix Release

- Updated version to 0.7.8.
- Focused stability release addressing 11 community-reported bugs.
- Key fixes include Docker API improvements, LLM extraction enhancements, URL handling corrections, and dependency updates.
- Added detailed release notes for v0.7.8 to the blog and a dedicated verification script to confirm all fixes work as intended.
- Updated documentation to reflect the changes and improvements.

* docs: add section for Crawl4AI Cloud API closed beta with application link

* fix: add disk cleanup step to Docker workflow

---------

Co-authored-by: rbushria <rbushri@gmail.com>
Co-authored-by: AHMET YILMAZ <tawfik@kidocode.com>
Co-authored-by: Soham Kukreti <kukretisoham@gmail.com>
Co-authored-by: Chris Murphy <chris.murphy@klaviyo.com>
Co-authored-by: Aravind Karnam <aravind.karanam@gmail.com>
Authored by Nasrin on 2025-12-11 18:04:52 +08:00 · committed by GitHub
parent 835e3c56fe · commit a87e8c1c9e
32 changed files with 2123 additions and 135 deletions

View File

@@ -72,6 +72,8 @@ from .deep_crawling import (
BestFirstCrawlingStrategy,
DFSDeepCrawlStrategy,
DeepCrawlDecorator,
ContentRelevanceFilter,
ContentTypeScorer,
)
# NEW: Import AsyncUrlSeeder
from .async_url_seeder import AsyncUrlSeeder

View File

@@ -1,7 +1,7 @@
# crawl4ai/__version__.py
# This is the version that will be used for stable releases
__version__ = "0.7.7"
__version__ = "0.7.8"
# For nightly builds, this gets set during build process
__nightly_version__ = None

View File

@@ -728,18 +728,18 @@ class EmbeddingStrategy(CrawlStrategy):
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
# response = perform_completion_with_backoff(
# provider=provider,
# prompt_with_variables=prompt,
# api_token=api_token,
# json_response=True
# )
response = perform_completion_with_backoff(
provider=provider,
prompt_with_variables=prompt,
api_token=api_token,
json_response=True
)
# variations = json.loads(response.choices[0].message.content)
variations = json.loads(response.choices[0].message.content)
# # Mock data with more variations for split
variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
# variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
# variations = {'queries': [

View File

@@ -1,5 +1,5 @@
import importlib
import os
from typing import Union
import warnings
import requests
from .config import (
@@ -27,14 +27,14 @@ from .table_extraction import TableExtractionStrategy, DefaultTableExtraction
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
from typing import Union, List, Callable
import inspect
from typing import Any, Dict, Optional
from typing import Any, Callable, Dict, List, Optional, Union
from enum import Enum
# Type alias for URL matching
UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]
class MatchMode(Enum):
OR = "or"
AND = "and"
@@ -42,8 +42,7 @@ class MatchMode(Enum):
# from .proxy_strategy import ProxyConfig
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
def to_serializable_dict(obj: Any, ignore_default_value : bool = False):
"""
Recursively convert an object to a serializable dictionary using {type, params} structure
for complex objects.
@@ -110,8 +109,6 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
# if value is not None:
# current_values[attr_name] = to_serializable_dict(value)
return {
"type": obj.__class__.__name__,
"params": current_values
@@ -137,12 +134,20 @@ def from_serializable_dict(data: Any) -> Any:
if data["type"] == "dict" and "value" in data:
return {k: from_serializable_dict(v) for k, v in data["value"].items()}
# Import from crawl4ai for class instances
import crawl4ai
if hasattr(crawl4ai, data["type"]):
cls = getattr(crawl4ai, data["type"])
cls = None
# If you are receiving an error while trying to convert a dict to an object:
# Either add a module to `modules_paths` list, or add the `data["type"]` to the crawl4ai __init__.py file
module_paths = ["crawl4ai"]
for module_path in module_paths:
try:
mod = importlib.import_module(module_path)
if hasattr(mod, data["type"]):
cls = getattr(mod, data["type"])
break
except (ImportError, AttributeError):
continue
if cls is not None:
# Handle Enum
if issubclass(cls, Enum):
return cls(data["params"])
@@ -598,7 +603,7 @@ class BrowserConfig:
"chrome_channel": self.chrome_channel,
"channel": self.channel,
"proxy": self.proxy,
"proxy_config": self.proxy_config,
"proxy_config": self.proxy_config.to_dict() if self.proxy_config else None,
"viewport_width": self.viewport_width,
"viewport_height": self.viewport_height,
"accept_downloads": self.accept_downloads,
@@ -1792,7 +1797,10 @@ class LLMConfig:
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
stop: Optional[List[str]] = None,
n: Optional[int] = None,
n: Optional[int] = None,
backoff_base_delay: Optional[int] = None,
backoff_max_attempts: Optional[int] = None,
backoff_exponential_factor: Optional[int] = None,
):
"""Configuaration class for LLM provider and API token."""
self.provider = provider
@@ -1821,6 +1829,9 @@ class LLMConfig:
self.presence_penalty = presence_penalty
self.stop = stop
self.n = n
self.backoff_base_delay = backoff_base_delay if backoff_base_delay is not None else 2
self.backoff_max_attempts = backoff_max_attempts if backoff_max_attempts is not None else 3
self.backoff_exponential_factor = backoff_exponential_factor if backoff_exponential_factor is not None else 2
@staticmethod
def from_kwargs(kwargs: dict) -> "LLMConfig":
@@ -1834,7 +1845,10 @@ class LLMConfig:
frequency_penalty=kwargs.get("frequency_penalty"),
presence_penalty=kwargs.get("presence_penalty"),
stop=kwargs.get("stop"),
n=kwargs.get("n")
n=kwargs.get("n"),
backoff_base_delay=kwargs.get("backoff_base_delay"),
backoff_max_attempts=kwargs.get("backoff_max_attempts"),
backoff_exponential_factor=kwargs.get("backoff_exponential_factor")
)
def to_dict(self):
@@ -1848,7 +1862,10 @@ class LLMConfig:
"frequency_penalty": self.frequency_penalty,
"presence_penalty": self.presence_penalty,
"stop": self.stop,
"n": self.n
"n": self.n,
"backoff_base_delay": self.backoff_base_delay,
"backoff_max_attempts": self.backoff_max_attempts,
"backoff_exponential_factor": self.backoff_exponential_factor
}
def clone(self, **kwargs):

View File

@@ -1023,6 +1023,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
final_messages = await self.adapter.retrieve_console_messages(page)
captured_console.extend(final_messages)
###
# This ensures we capture the current page URL at the time we return the response,
# which correctly reflects any JavaScript navigation that occurred.
###
redirected_url = page.url # Use current page URL to capture JS redirects
# Return complete response
return AsyncCrawlResponse(
html=html,

View File

@@ -617,17 +617,17 @@ class AsyncWebCrawler:
else config.chunking_strategy
)
sections = chunking.chunk(content)
# extracted_content = config.extraction_strategy.run(url, sections)
# extracted_content = config.extraction_strategy.run(_url, sections)
# Use async version if available for better parallelism
if hasattr(config.extraction_strategy, 'arun'):
extracted_content = await config.extraction_strategy.arun(url, sections)
extracted_content = await config.extraction_strategy.arun(_url, sections)
else:
# Fallback to sync version run in thread pool to avoid blocking
extracted_content = await asyncio.to_thread(
config.extraction_strategy.run, url, sections
)
extracted_content = json.dumps(
extracted_content, indent=4, default=str, ensure_ascii=False
)

View File

@@ -980,6 +980,9 @@ class LLMContentFilter(RelevantContentFilter):
prompt,
api_token,
base_url=base_url,
base_delay=self.llm_config.backoff_base_delay,
max_attempts=self.llm_config.backoff_max_attempts,
exponential_factor=self.llm_config.backoff_exponential_factor,
extra_args=extra_args,
)

View File

@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
if el.tag in bypass_tags:
continue
# Skip elements inside <pre> or <code> tags where whitespace is significant
# This preserves whitespace-only spans (e.g., <span class="w"> </span>) in code blocks
is_in_code_block = False
ancestor = el.getparent()
while ancestor is not None:
if ancestor.tag in ("pre", "code"):
is_in_code_block = True
break
ancestor = ancestor.getparent()
if is_in_code_block:
continue
text_content = (el.text_content() or "").strip()
if (
len(text_content.split()) < word_count_threshold

View File

@@ -509,18 +509,22 @@ class DomainFilter(URLFilter):
class ContentRelevanceFilter(URLFilter):
"""BM25-based relevance filter using head section content"""
__slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
__slots__ = ("query_terms", "threshold", "k1", "b", "avgdl", "query")
def __init__(
self,
query: str,
query: Union[str, List[str]],
threshold: float,
k1: float = 1.2,
b: float = 0.75,
avgdl: int = 1000,
):
super().__init__(name="BM25RelevanceFilter")
self.query_terms = self._tokenize(query)
if isinstance(query, list):
self.query = " ".join(query)
else:
self.query = query
self.query_terms = self._tokenize(self.query)
self.threshold = threshold
self.k1 = k1 # TF saturation parameter
self.b = b # Length normalization parameter

View File

@@ -180,7 +180,7 @@ class Crawl4aiDockerClient:
yield CrawlResult(**result)
return stream_results()
response = await self._request("POST", "/crawl", json=data)
response = await self._request("POST", "/crawl", json=data, timeout=hooks_timeout)
result_data = response.json()
if not result_data.get("success", False):
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")

View File

@@ -649,6 +649,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
base_url=self.llm_config.base_url,
json_response=self.force_json_response,
extra_args=self.extra_args,
base_delay=self.llm_config.backoff_base_delay,
max_attempts=self.llm_config.backoff_max_attempts,
exponential_factor=self.llm_config.backoff_exponential_factor
) # , json_response=self.extract_type == "schema")
# Track usage
usage = TokenUsage(
@@ -846,6 +849,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
base_url=self.llm_config.base_url,
json_response=self.force_json_response,
extra_args=self.extra_args,
base_delay=self.llm_config.backoff_base_delay,
max_attempts=self.llm_config.backoff_max_attempts,
exponential_factor=self.llm_config.backoff_exponential_factor
)
# Track usage
usage = TokenUsage(

View File

@@ -1,4 +1,4 @@
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field, ConfigDict
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from typing import AsyncGenerator
from typing import Generic, TypeVar
@@ -153,8 +153,7 @@ class CrawlResult(BaseModel):
console_messages: Optional[List[Dict[str, Any]]] = None
tables: List[Dict] = Field(default_factory=list) # NEW [{headers,rows,caption,summary}]
class Config:
arbitrary_types_allowed = True
model_config = ConfigDict(arbitrary_types_allowed=True)
# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
# and model_dump override all exist to support a smooth transition from markdown as a string
@@ -332,8 +331,7 @@ class AsyncCrawlResponse(BaseModel):
network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None
class Config:
arbitrary_types_allowed = True
model_config = ConfigDict(arbitrary_types_allowed=True)
###############################
# Scraping Models

View File

@@ -15,9 +15,9 @@ from .utils import (
clean_pdf_text_to_html,
)
# Remove direct PyPDF2 imports from the top
# import PyPDF2
# from PyPDF2 import PdfReader
# Remove direct pypdf imports from the top
# import pypdf
# from pypdf import PdfReader
logger = logging.getLogger(__name__)
@@ -59,9 +59,9 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
# Import check at initialization time
try:
import PyPDF2
import pypdf
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
self.image_dpi = image_dpi
self.image_quality = image_quality
@@ -75,9 +75,9 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
def process(self, pdf_path: Path) -> PDFProcessResult:
# Import inside method to allow dependency to be optional
try:
from PyPDF2 import PdfReader
from pypdf import PdfReader
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
start_time = time()
result = PDFProcessResult(
@@ -125,15 +125,15 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
"""Like process() but processes PDF pages in parallel batches"""
# Import inside method to allow dependency to be optional
try:
from PyPDF2 import PdfReader
import PyPDF2 # For type checking
from pypdf import PdfReader
import pypdf # For type checking
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
import concurrent.futures
import threading
# Initialize PyPDF2 thread support
# Initialize pypdf thread support
if not hasattr(threading.current_thread(), "_children"):
threading.current_thread()._children = set()
@@ -232,11 +232,11 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
return pdf_page
def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
# Import PyPDF2 for type checking only when needed
# Import pypdf for type checking only when needed
try:
import PyPDF2
from pypdf.generic import IndirectObject
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
if not self.extract_images:
return []
@@ -266,7 +266,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
width = xobj.get('/Width', 0)
height = xobj.get('/Height', 0)
color_space = xobj.get('/ColorSpace', '/DeviceRGB')
if isinstance(color_space, PyPDF2.generic.IndirectObject):
if isinstance(color_space, IndirectObject):
color_space = color_space.get_object()
# Handle different image encodings
@@ -277,7 +277,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
if '/FlateDecode' in filters:
try:
decode_parms = xobj.get('/DecodeParms', {})
if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
if isinstance(decode_parms, IndirectObject):
decode_parms = decode_parms.get_object()
predictor = decode_parms.get('/Predictor', 1)
@@ -416,10 +416,10 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
# Import inside method to allow dependency to be optional
if reader is None:
try:
from PyPDF2 import PdfReader
from pypdf import PdfReader
reader = PdfReader(pdf_path)
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
meta = reader.metadata or {}
created = self._parse_pdf_date(meta.get('/CreationDate', ''))
@@ -459,11 +459,11 @@ if __name__ == "__main__":
from pathlib import Path
try:
# Import PyPDF2 only when running the file directly
import PyPDF2
from PyPDF2 import PdfReader
# Import pypdf only when running the file directly
import pypdf
from pypdf import PdfReader
except ImportError:
print("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
print("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
exit(1)
current_dir = Path(__file__).resolve().parent

View File

@@ -795,6 +795,9 @@ Return only a JSON array of extracted tables following the specified format."""
api_token=self.llm_config.api_token,
base_url=self.llm_config.base_url,
json_response=True,
base_delay=self.llm_config.backoff_base_delay,
max_attempts=self.llm_config.backoff_max_attempts,
exponential_factor=self.llm_config.backoff_exponential_factor,
extra_args=self.extra_args
)
@@ -1116,6 +1119,9 @@ Return only a JSON array of extracted tables following the specified format."""
api_token=self.llm_config.api_token,
base_url=self.llm_config.base_url,
json_response=True,
base_delay=self.llm_config.backoff_base_delay,
max_attempts=self.llm_config.backoff_max_attempts,
exponential_factor=self.llm_config.backoff_exponential_factor,
extra_args=self.extra_args
)

View File

@@ -1745,6 +1745,9 @@ def perform_completion_with_backoff(
api_token,
json_response=False,
base_url=None,
base_delay=2,
max_attempts=3,
exponential_factor=2,
**kwargs,
):
"""
@@ -1761,6 +1764,9 @@ def perform_completion_with_backoff(
api_token (str): The API token for authentication.
json_response (bool): Whether to request a JSON response. Defaults to False.
base_url (Optional[str]): The base URL for the API. Defaults to None.
base_delay (int): The base delay in seconds. Defaults to 2.
max_attempts (int): The maximum number of attempts. Defaults to 3.
exponential_factor (int): The exponential factor. Defaults to 2.
**kwargs: Additional arguments for the API request.
Returns:
@@ -1770,9 +1776,6 @@ def perform_completion_with_backoff(
from litellm import completion
from litellm.exceptions import RateLimitError
max_attempts = 3
base_delay = 2 # Base delay in seconds, you can adjust this based on your needs
extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
if json_response:
extra_args["response_format"] = {"type": "json_object"}
@@ -1798,7 +1801,7 @@ def perform_completion_with_backoff(
# Check if we have exhausted our max attempts
if attempt < max_attempts - 1:
# Calculate the delay and wait
delay = base_delay * (2**attempt) # Exponential backoff formula
delay = base_delay * (exponential_factor**attempt) # Exponential backoff formula
print(f"Waiting for {delay} seconds before retrying...")
time.sleep(delay)
else:
@@ -1831,6 +1834,9 @@ async def aperform_completion_with_backoff(
api_token,
json_response=False,
base_url=None,
base_delay=2,
max_attempts=3,
exponential_factor=2,
**kwargs,
):
"""
@@ -1847,6 +1853,9 @@ async def aperform_completion_with_backoff(
api_token (str): The API token for authentication.
json_response (bool): Whether to request a JSON response. Defaults to False.
base_url (Optional[str]): The base URL for the API. Defaults to None.
base_delay (int): The base delay in seconds. Defaults to 2.
max_attempts (int): The maximum number of attempts. Defaults to 3.
exponential_factor (int): The exponential factor. Defaults to 2.
**kwargs: Additional arguments for the API request.
Returns:
@@ -1857,9 +1866,6 @@ async def aperform_completion_with_backoff(
from litellm.exceptions import RateLimitError
import asyncio
max_attempts = 3
base_delay = 2 # Base delay in seconds, you can adjust this based on your needs
extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
if json_response:
extra_args["response_format"] = {"type": "json_object"}
@@ -1885,7 +1891,7 @@ async def aperform_completion_with_backoff(
# Check if we have exhausted our max attempts
if attempt < max_attempts - 1:
# Calculate the delay and wait
delay = base_delay * (2**attempt) # Exponential backoff formula
delay = base_delay * (exponential_factor**attempt) # Exponential backoff formula
print(f"Waiting for {delay} seconds before retrying...")
await asyncio.sleep(delay)
else: