Release prep (#749)

* fix: Update export of URLPatternFilter

* chore: Add dependency for cchardet in requirements

* docs: Update example for deep crawl in release note for v0.5

* Docs: update the example for memory dispatcher

* docs: updated example for crawl strategies

* Refactor: Removed wrapping in if __name__==main block since this is a markdown file.

* chore: removed cchardet from dependency list, since unclecode is planning to remove it

* docs: updated the example for proxy rotation to a working example

* feat: Introduced ProxyConfig param

* Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1

* chore: update and test new dependencies

* feat: Make PyPDF2 a conditional dependency

* updated tutorial and release note for v0.5

* docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename

* refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult

* fix: Bug in serialisation of markdown in acache_url

* Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown

* fix: remove deprecated markdown_v2 from docker

* Refactor: remove deprecated fit_markdown and fit_html from result

* refactor: fix cache retrieval for markdown as a string

* chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
This commit is contained in:
Aravind
2025-02-28 17:23:35 +05:30
committed by GitHub
parent 3a87b4e43b
commit a9e24307cc
38 changed files with 2040 additions and 326 deletions

View File

@@ -7,7 +7,7 @@ from contextlib import asynccontextmanager
import logging
import json # Added for serialization/deserialization
from .utils import ensure_content_dirs, generate_content_hash
from .models import CrawlResult, MarkdownGenerationResult
from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown
import aiofiles
from .utils import VersionManager
from .async_logger import AsyncLogger
@@ -336,12 +336,17 @@ class AsyncDatabaseManager:
except json.JSONDecodeError:
# Very UGLY, never mention it to me please
if field == "markdown" and isinstance(row_dict[field], str):
row_dict[field] = row_dict[field]
row_dict[field] = MarkdownGenerationResult(
raw_markdown=row_dict[field] or "",
markdown_with_citations="",
references_markdown="",
fit_markdown="",
fit_html="",
)
else:
row_dict[field] = {}
if isinstance(row_dict["markdown"], Dict):
row_dict["markdown_v2"] = row_dict["markdown"]
if row_dict["markdown"].get("raw_markdown"):
row_dict["markdown"] = row_dict["markdown"]["raw_markdown"]
@@ -358,7 +363,7 @@ class AsyncDatabaseManager:
# Remove any fields not in CrawlResult model
valid_fields = CrawlResult.__annotations__.keys()
filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
filtered_dict["markdown"] = row_dict["markdown"]
return CrawlResult(**filtered_dict)
try:
@@ -384,14 +389,14 @@ class AsyncDatabaseManager:
}
try:
if isinstance(result.markdown, MarkdownGenerationResult):
if isinstance(result.markdown, StringCompatibleMarkdown):
content_map["markdown"] = (
result.markdown.model_dump_json(),
result.markdown,
"markdown",
)
elif hasattr(result, "markdown_v2"):
elif isinstance(result.markdown, MarkdownGenerationResult):
content_map["markdown"] = (
result.markdown_v2.model_dump_json(),
result.markdown.model_dump_json(),
"markdown",
)
elif isinstance(result.markdown, str):

View File

@@ -362,7 +362,7 @@ class AsyncWebCrawler:
self.logger.info(
message="Switch proxy: {proxy}",
tag="PROXY",
params={"proxy": next_proxy.get("server")},
params={"proxy": next_proxy.server},
)
config.proxy_config = next_proxy
# config = config.clone(proxy_config=next_proxy)
@@ -581,8 +581,6 @@ class AsyncWebCrawler:
# html2text_options=kwargs.get('html2text', {})
)
)
markdown_v2 = markdown_result
markdown = sanitize_input_encode(markdown_result.raw_markdown)
# Log processing completion
self.logger.info(
@@ -611,11 +609,11 @@ class AsyncWebCrawler:
content_format = "markdown"
content = {
"markdown": markdown,
"markdown": markdown_result.raw_markdown,
"html": html,
"cleaned_html": cleaned_html,
"fit_markdown": markdown_result.raw_markdown,
}.get(content_format, markdown)
"fit_markdown": markdown_result.fit_markdown,
}.get(content_format, markdown_result.raw_markdown)
# Use IdentityChunking for HTML input, otherwise use provided chunking strategy
chunking = (
@@ -649,10 +647,7 @@ class AsyncWebCrawler:
url=url,
html=html,
cleaned_html=cleaned_html,
markdown_v2=markdown_v2,
markdown=markdown,
fit_markdown=markdown_result.fit_markdown,
fit_html=markdown_result.fit_html,
markdown=markdown_result,
media=media,
links=links,
metadata=metadata,

View File

@@ -633,12 +633,12 @@ class BrowserManager:
# Check if there is value for crawlerRunConfig.proxy_config set add that to context
if crawlerRunConfig.proxy_config:
proxy_settings = {
"server": crawlerRunConfig.proxy_config.get("server"),
"server": crawlerRunConfig.proxy_config.server,
}
if crawlerRunConfig.proxy_config.get("username"):
if crawlerRunConfig.proxy_config.username:
proxy_settings.update({
"username": crawlerRunConfig.proxy_config.get("username"),
"password": crawlerRunConfig.proxy_config.get("password"),
"username": crawlerRunConfig.proxy_config.username,
"password": crawlerRunConfig.proxy_config.password,
})
context_settings["proxy"] = proxy_settings

View File

@@ -19,6 +19,8 @@ from crawl4ai import (
from litellm import completion
from pathlib import Path
from crawl4ai.async_configs import LlmConfig
def get_global_config() -> dict:
config_dir = Path.home() / ".crawl4ai"
config_file = config_dir / "global.yml"
@@ -288,7 +290,7 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "markdown-v2", "md", "md-fit"]), default="all")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default = True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@@ -351,9 +353,8 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
raise click.ClickException("LLM provider and API token are required for LLM extraction")
crawler_cfg.extraction_strategy = LLMExtractionStrategy(
provider=extract_conf["provider"],
llmConfig=LlmConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
instruction=extract_conf["instruction"],
api_token=extract_conf.get("api_token", extract_conf.get("api_key")),
schema=schema_data,
**extract_conf.get("params", {})
)
@@ -383,7 +384,7 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
# Handle question
if question:
provider, token = setup_llm_config()
markdown = result.markdown_v2.raw_markdown
markdown = result.markdown.raw_markdown
anyio.run(stream_llm_response, url, markdown, question, provider, token)
return
@@ -393,9 +394,9 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
elif output == "json":
click.echo(json.dumps(json.loads(result.extracted_content), indent=2))
elif output in ["markdown", "md"]:
click.echo(result.markdown_v2.raw_markdown)
click.echo(result.markdown.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
click.echo(result.markdown_v2.fit_markdown)
click.echo(result.markdown.fit_markdown)
except Exception as e:
raise click.ClickException(str(e))

View File

@@ -0,0 +1,2 @@
from .proxy_config import ProxyConfig
__all__ = ["ProxyConfig"]

View File

@@ -0,0 +1,113 @@
import os
from typing import Dict, List, Optional
class ProxyConfig:
    """Connection settings for a single proxy server.

    Instances expose four plain attributes: ``server``, ``username``,
    ``password`` and ``ip``. Alternate constructors build instances from a
    colon-separated string, a dictionary, or an environment variable.
    """

    def __init__(
        self,
        server: str,
        username: Optional[str] = None,
        password: Optional[str] = None,
        ip: Optional[str] = None,
    ):
        """Configuration class for a single proxy.

        Args:
            server: Proxy server URL (e.g., "http://127.0.0.1:8080")
            username: Optional username for proxy authentication
            password: Optional password for proxy authentication
            ip: Optional IP address for verification purposes
        """
        self.server = server
        self.username = username
        self.password = password
        # Extract IP from server if not explicitly provided
        self.ip = ip or self._extract_ip_from_server()

    def _extract_ip_from_server(self) -> Optional[str]:
        """Extract the host part of ``self.server``.

        Accepts both ``scheme://host:port`` and bare ``host:port`` forms.
        Embedded credentials (``user:pw@host``), a trailing path and IPv6
        brackets are ignored, so only the host itself is returned (lower-cased
        by ``urlsplit``).

        Returns:
            The host, or None when ``server`` is empty, not a string, or
            cannot be parsed.
        """
        from urllib.parse import urlsplit

        server = self.server
        if not isinstance(server, str) or not server:
            return None
        # urlsplit only recognises a network location after "//", so a bare
        # "host:port" needs the marker prepended.
        target = server if "://" in server else f"//{server}"
        try:
            return urlsplit(target).hostname
        except ValueError:
            # Raised for malformed authorities such as unbalanced IPv6 brackets.
            return None

    @classmethod
    def from_string(cls, proxy_str: str) -> "ProxyConfig":
        """Create a ProxyConfig from 'ip:port' or 'ip:port:username:password'.

        Surrounding whitespace is ignored. Everything after the third colon
        is taken as the password, so passwords may themselves contain ':'.

        Raises:
            ValueError: If the string matches neither format or has an empty
                ip or port.
        """
        parts = proxy_str.strip().split(":", 3)
        if len(parts) in (2, 4) and parts[0] and parts[1]:
            ip, port = parts[0], parts[1]
            if len(parts) == 4:  # ip:port:username:password
                return cls(
                    server=f"http://{ip}:{port}",
                    username=parts[2],
                    password=parts[3],
                    ip=ip,
                )
            # ip:port only
            return cls(server=f"http://{ip}:{port}", ip=ip)
        raise ValueError(f"Invalid proxy string format: {proxy_str}")

    @classmethod
    def from_dict(cls, proxy_dict: Dict) -> "ProxyConfig":
        """Create a ProxyConfig from a dictionary.

        Reads the keys "server", "username", "password" and "ip"; missing
        keys default to None.
        """
        return cls(
            server=proxy_dict.get("server"),
            username=proxy_dict.get("username"),
            password=proxy_dict.get("password"),
            ip=proxy_dict.get("ip"),
        )

    @classmethod
    def from_env(cls, env_var: str = "PROXIES") -> List["ProxyConfig"]:
        """Load proxies from environment variable.

        Args:
            env_var: Name of environment variable containing comma-separated
                proxy strings

        Returns:
            List of ProxyConfig objects. Blank entries are skipped; a
            malformed entry is reported and skipped without affecting the
            entries that follow it.
        """
        proxies = []
        for raw_entry in os.getenv(env_var, "").split(","):
            entry = raw_entry.strip()
            if not entry:
                continue
            # Guard each entry on its own so one bad value cannot discard
            # every proxy listed after it.
            try:
                proxies.append(cls.from_string(entry))
            except ValueError as e:
                print(f"Error loading proxies from environment: {e}")
        return proxies

    def to_dict(self) -> Dict:
        """Convert to dictionary representation."""
        return {
            "server": self.server,
            "username": self.username,
            "password": self.password,
            "ip": self.ip,
        }

    def clone(self, **kwargs) -> "ProxyConfig":
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            ProxyConfig: A new instance (of the same class as ``self``) with
            the specified updates
        """
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return type(self).from_dict(config_dict)

View File

@@ -8,6 +8,7 @@ from .filters import (
ContentTypeFilter,
DomainFilter,
URLFilter,
URLPatternFilter,
FilterStats,
ContentRelevanceFilter,
SEOFilter
@@ -32,6 +33,7 @@ __all__ = [
"ContentTypeFilter",
"DomainFilter",
"URLFilter",
"URLPatternFilter",
"FilterStats",
"ContentRelevanceFilter",
"SEOFilter",

View File

@@ -1,5 +1,5 @@
from re import U
from pydantic import BaseModel, HttpUrl
from pydantic import BaseModel, HttpUrl, PrivateAttr
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from enum import Enum
from dataclasses import dataclass
@@ -86,6 +86,9 @@ class MarkdownGenerationResult(BaseModel):
fit_markdown: Optional[str] = None
fit_html: Optional[str] = None
def __str__(self):
return self.raw_markdown
@dataclass
class TraversalStats:
"""Statistics for the traversal process"""
@@ -105,7 +108,6 @@ class DispatchResult(BaseModel):
end_time: Union[datetime, float]
error_message: str = ""
class CrawlResult(BaseModel):
url: str
html: str
@@ -117,10 +119,7 @@ class CrawlResult(BaseModel):
js_execution_result: Optional[Dict[str, Any]] = None
screenshot: Optional[str] = None
pdf: Optional[bytes] = None
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
markdown_v2: Optional[MarkdownGenerationResult] = None
fit_markdown: Optional[str] = None
fit_html: Optional[str] = None
_markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
error_message: Optional[str] = None
@@ -134,6 +133,118 @@ class CrawlResult(BaseModel):
class Config:
arbitrary_types_allowed = True
# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
# and model_dump override all exist to support a smooth transition from markdown as a string
# to markdown as a MarkdownGenerationResult object, while maintaining backward compatibility.
#
# This allows code that expects markdown to be a string to continue working, while also
# providing access to the full MarkdownGenerationResult object's properties.
#
# The markdown_v2 property is deprecated and raises an error directing users to use markdown.
#
# When backward compatibility is no longer needed in future versions, this entire mechanism
# can be simplified to a standard field with no custom accessors or serialization logic.
def __init__(self, **data):
    """Build the result, routing any 'markdown' value to the private slot.

    The 'markdown' key is removed from ``data`` before the remaining keys
    are handed to the parent initializer; a non-None value is then stored
    on ``_markdown``.
    """
    pending_markdown = data.pop("markdown", None)
    super().__init__(**data)
    if pending_markdown is None:
        return
    self._markdown = pending_markdown
@property
def markdown(self):
    """
    String-like view of the stored markdown result.

    Returns None when nothing is stored; otherwise wraps the stored value in
    a StringCompatibleMarkdown, so callers that expect a plain string keep
    working while the full result's attributes remain reachable.
    """
    stored = self._markdown
    return None if stored is None else StringCompatibleMarkdown(stored)

@markdown.setter
def markdown(self, value):
    """
    Replace the stored markdown result with ``value``.
    """
    self._markdown = value
@property
def markdown_v2(self):
    """
    Removed attribute: every access raises AttributeError.

    The error text points users at 'markdown' and lists the properties
    available on it.
    """
    message_lines = (
        "The 'markdown_v2' attribute is deprecated and has been removed. "
        "Please use 'markdown' instead, which now returns a MarkdownGenerationResult, with",
        "following properties:",
        "- raw_markdown: The raw markdown string",
        "- markdown_with_citations: The markdown string with citations",
        "- references_markdown: The markdown string with references",
        "- fit_markdown: The markdown string with fit text",
        "",  # the message ends with a trailing newline
    )
    raise AttributeError("\n".join(message_lines))
@property
def fit_markdown(self):
    """
    Removed attribute: every access raises AttributeError that points
    callers at 'markdown.fit_markdown'.
    """
    message = (
        "The 'fit_markdown' attribute is deprecated and has been removed. "
        "Please use 'markdown.fit_markdown' instead."
    )
    raise AttributeError(message)
@property
def fit_html(self):
    """
    Removed attribute: every access raises AttributeError that points
    callers at 'markdown.fit_html'.
    """
    message = (
        "The 'fit_html' attribute is deprecated and has been removed. "
        "Please use 'markdown.fit_html' instead."
    )
    raise AttributeError(message)
def model_dump(self, *args, **kwargs):
    """
    Serialize the model and add the privately stored markdown result.

    The parent ``model_dump`` is called with the caller's arguments. When a
    markdown result is stored on ``_markdown``, its own ``model_dump()``
    output is placed under the "markdown" key of the returned dictionary,
    keeping a 'markdown' entry in the serialized output even though the
    value lives in a private attribute.

    Note: the nested ``model_dump()`` call takes no arguments, so options
    passed to this method are not forwarded to the markdown result.
    """
    dumped = super().model_dump(*args, **kwargs)
    markdown_obj = self._markdown
    if markdown_obj is None:
        return dumped
    dumped["markdown"] = markdown_obj.model_dump()
    return dumped
class StringCompatibleMarkdown(str):
    """A string subclass that also provides access to MarkdownGenerationResult attributes.

    The string value is the wrapped result's ``raw_markdown``. Any attribute
    that ``str`` does not provide is looked up on the wrapped result instead.
    """

    def __new__(cls, markdown_result):
        """Create the string from ``markdown_result.raw_markdown``.

        A plain string is also accepted and used as-is: the copy/pickle
        protocol rebuilds str subclasses by calling ``__new__`` with the
        string value, not with the original wrapped object.
        """
        if isinstance(markdown_result, str):
            text = markdown_result
        else:
            text = markdown_result.raw_markdown
        return super().__new__(cls, text)

    def __init__(self, markdown_result):
        """Remember the wrapped result for attribute delegation."""
        self._markdown_result = markdown_result

    def __getattr__(self, name):
        """Delegate attributes that ``str`` lacks to the wrapped result.

        Raises:
            AttributeError: For the backing attribute itself when it has not
                been set yet, and for dunder names. Without this guard, an
                instance built without ``__init__`` would re-enter this
                method forever, and protocol probes such as ``__setstate__``
                would be answered by the wrapped object.
        """
        if name == "_markdown_result" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self._markdown_result, name)
# END of backward compatibility code for markdown/markdown_v2.
# When removing this code in the future, make sure to:
# 1. Replace the private attribute and property with a standard field
# 2. Update any serialization logic that might depend on the current behavior
class AsyncCrawlResponse(BaseModel):
html: str

View File

@@ -484,4 +484,4 @@ if __name__ == "__main__":
for page in result.pages:
f.write(f'# Page {page["page_number"]}\n\n')
f.write(clean_pdf_text(page["page_number"], page['raw_text']))
f.write('\n\n')
f.write('\n\n')

View File

@@ -2,6 +2,8 @@ from typing import List, Dict, Optional
from abc import ABC, abstractmethod
from itertools import cycle
from crawl4ai.configs import ProxyConfig
class ProxyRotationStrategy(ABC):
"""Base abstract class for proxy rotation strategies"""
@@ -15,28 +17,27 @@ class ProxyRotationStrategy(ABC):
"""Add proxy configurations to the strategy"""
pass
class RoundRobinProxyStrategy(ProxyRotationStrategy):
"""Simple round-robin proxy rotation strategy"""
class RoundRobinProxyStrategy:
"""Simple round-robin proxy rotation strategy using ProxyConfig objects"""
def __init__(self, proxies: List[Dict] = None):
def __init__(self, proxies: List[ProxyConfig] = None):
"""
Initialize with optional list of proxy configurations
Args:
proxies: List of proxy config dictionaries, each containing at least
'server' key with proxy URL
proxies: List of ProxyConfig objects
"""
self._proxies = []
self._proxy_cycle = None
if proxies:
self.add_proxies(proxies)
def add_proxies(self, proxies: List[Dict]):
def add_proxies(self, proxies: List[ProxyConfig]):
"""Add new proxies to the rotation pool"""
self._proxies.extend(proxies)
self._proxy_cycle = cycle(self._proxies)
async def get_next_proxy(self) -> Optional[Dict]:
async def get_next_proxy(self) -> Optional[ProxyConfig]:
"""Get next proxy in round-robin fashion"""
if not self._proxy_cycle:
return None