Release prep (#749)

* fix: Update export of URLPatternFilter * chore: Add dependency for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependency list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependencies * feat: Make PyPDF2 a conditional dependency * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on the crawl result. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
parent 3a87b4e43b
commit a9e24307cc
38 changed files with 2040 additions and 326 deletions
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -7,7 +7,7 @@ from contextlib import asynccontextmanager
 import logging
 import json  # Added for serialization/deserialization
 from .utils import ensure_content_dirs, generate_content_hash
-from .models import CrawlResult, MarkdownGenerationResult
+from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown
 import aiofiles
 from .utils import VersionManager
 from .async_logger import AsyncLogger
@@ -336,12 +336,17 @@ class AsyncDatabaseManager:
                    except json.JSONDecodeError:
                        # Very UGLY, never mention it to me please
                        if field == "markdown" and isinstance(row_dict[field], str):
-                            row_dict[field] = row_dict[field]
+                            row_dict[field] = MarkdownGenerationResult(
+                                raw_markdown=row_dict[field] or "",
+                                markdown_with_citations="",
+                                references_markdown="",
+                                fit_markdown="",
+                                fit_html="",
+                            )
                        else:
                            row_dict[field] = {}

                if isinstance(row_dict["markdown"], Dict):
-                    row_dict["markdown_v2"] = row_dict["markdown"]
                    if row_dict["markdown"].get("raw_markdown"):
                        row_dict["markdown"] = row_dict["markdown"]["raw_markdown"]

@@ -358,7 +363,7 @@ class AsyncDatabaseManager:
                # Remove any fields not in CrawlResult model
                valid_fields = CrawlResult.__annotations__.keys()
                filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
-
+                filtered_dict["markdown"] = row_dict["markdown"]
                return CrawlResult(**filtered_dict)

        try:
@@ -384,14 +389,14 @@ class AsyncDatabaseManager:
        }

        try:
-            if isinstance(result.markdown, MarkdownGenerationResult):
+            if isinstance(result.markdown, StringCompatibleMarkdown):
                content_map["markdown"] = (
-                    result.markdown.model_dump_json(),
+                    result.markdown,
                    "markdown",
                )
-            elif hasattr(result, "markdown_v2"):
+            elif isinstance(result.markdown, MarkdownGenerationResult):
                content_map["markdown"] = (
-                    result.markdown_v2.model_dump_json(),
+                    result.markdown.model_dump_json(),
                    "markdown",
                )
            elif isinstance(result.markdown, str):
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -362,7 +362,7 @@ class AsyncWebCrawler:
                            self.logger.info(
                                message="Switch proxy: {proxy}",
                                tag="PROXY",
-                                params={"proxy": next_proxy.get("server")},
+                                params={"proxy": next_proxy.server},
                            )
                        config.proxy_config = next_proxy
                        # config = config.clone(proxy_config=next_proxy)
@@ -581,8 +581,6 @@ class AsyncWebCrawler:
                # html2text_options=kwargs.get('html2text', {})
            )
        )
-        markdown_v2 = markdown_result
-        markdown = sanitize_input_encode(markdown_result.raw_markdown)

        # Log processing completion
        self.logger.info(
@@ -611,11 +609,11 @@ class AsyncWebCrawler:
                content_format = "markdown"

            content = {
-                "markdown": markdown,
+                "markdown": markdown_result.raw_markdown,
                "html": html,
                "cleaned_html": cleaned_html,
-                "fit_markdown": markdown_result.raw_markdown,
-            }.get(content_format, markdown)
+                "fit_markdown": markdown_result.fit_markdown,
+            }.get(content_format, markdown_result.raw_markdown)

            # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
            chunking = (
@@ -649,10 +647,7 @@ class AsyncWebCrawler:
            url=url,
            html=html,
            cleaned_html=cleaned_html,
-            markdown_v2=markdown_v2,
-            markdown=markdown,
-            fit_markdown=markdown_result.fit_markdown,
-            fit_html=markdown_result.fit_html,
+            markdown=markdown_result,
            media=media,
            links=links,
            metadata=metadata,
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -633,12 +633,12 @@ class BrowserManager:
            # Check if there is value for crawlerRunConfig.proxy_config set add that to context
            if crawlerRunConfig.proxy_config:
                proxy_settings = {
-                    "server": crawlerRunConfig.proxy_config.get("server"),
+                    "server": crawlerRunConfig.proxy_config.server,
                }
-                if crawlerRunConfig.proxy_config.get("username"):
+                if crawlerRunConfig.proxy_config.username:
                    proxy_settings.update({
-                        "username": crawlerRunConfig.proxy_config.get("username"),
-                        "password": crawlerRunConfig.proxy_config.get("password"),
+                        "username": crawlerRunConfig.proxy_config.username,
+                        "password": crawlerRunConfig.proxy_config.password,
                    })
                context_settings["proxy"] = proxy_settings

--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -19,6 +19,8 @@ from crawl4ai import (
 from litellm import completion
 from pathlib import Path

+from crawl4ai.async_configs import LlmConfig
+
 def get_global_config() -> dict:
    config_dir = Path.home() / ".crawl4ai"
    config_file = config_dir / "global.yml"
@@ -288,7 +290,7 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
-@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "markdown-v2", "md", "md-fit"]), default="all")
+@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default = True,  help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@@ -351,9 +353,8 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
                    raise click.ClickException("LLM provider and API token are required for LLM extraction")

                crawler_cfg.extraction_strategy = LLMExtractionStrategy(
-                    provider=extract_conf["provider"],
+                    llmConfig=LlmConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
                    instruction=extract_conf["instruction"],
-                    api_token=extract_conf.get("api_token", extract_conf.get("api_key")),
                    schema=schema_data,
                    **extract_conf.get("params", {})
                )
@@ -383,7 +384,7 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
        # Handle question
        if question:
            provider, token = setup_llm_config()
-            markdown = result.markdown_v2.raw_markdown
+            markdown = result.markdown.raw_markdown
            anyio.run(stream_llm_response, url, markdown, question, provider, token)
            return
        
@@ -393,9 +394,9 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte
        elif output == "json":
            click.echo(json.dumps(json.loads(result.extracted_content), indent=2))
        elif output in ["markdown", "md"]:
-            click.echo(result.markdown_v2.raw_markdown)
+            click.echo(result.markdown.raw_markdown)
        elif output in ["markdown-fit", "md-fit"]:
-            click.echo(result.markdown_v2.fit_markdown)
+            click.echo(result.markdown.fit_markdown)
            
    except Exception as e:
        raise click.ClickException(str(e))
--- a/crawl4ai/configs/init.py
+++ b/crawl4ai/configs/init.py
@@ -0,0 +1,2 @@
+from .proxy_config import ProxyConfig
+__all__ = ["ProxyConfig"]
--- a/crawl4ai/configs/proxy_config.py
+++ b/crawl4ai/configs/proxy_config.py
@@ -0,0 +1,113 @@
+import os
+from typing import Dict, List, Optional
+
+
+class ProxyConfig:
+    """Settings for a single proxy: server URL, optional credentials and host/IP."""
+
+    def __init__(
+        self,
+        server: str,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        ip: Optional[str] = None,
+    ):
+        """Configuration class for a single proxy.
+
+        Args:
+            server: Proxy server URL (e.g., "http://127.0.0.1:8080")
+            username: Optional username for proxy authentication
+            password: Optional password for proxy authentication
+            ip: Optional IP address for verification purposes
+        """
+        self.server = server
+        self.username = username
+        self.password = password
+
+        # Extract IP from server if not explicitly provided
+        self.ip = ip or self._extract_ip_from_server()
+
+    def _extract_ip_from_server(self) -> Optional[str]:
+        """Return the host of ``self.server`` (credentials, port and path removed), or None."""
+        from urllib.parse import urlsplit  # local import keeps this change self-contained
+
+        if not isinstance(self.server, str) or not self.server:
+            return None
+        # urlsplit only sees a host after "//", so prepend it for bare "host:port" strings.
+        target = self.server if "://" in self.server else f"//{self.server}"
+        try:
+            return urlsplit(target).hostname
+        except ValueError:  # malformed authority, e.g. an unterminated IPv6 bracket
+            return None
+
+    @staticmethod
+    def from_string(proxy_str: str) -> "ProxyConfig":
+        """Create a ProxyConfig from 'ip:port' or 'ip:port:username:password'.
+
+        Surrounding whitespace is ignored. Raises ValueError for any other shape.
+        """
+        parts = proxy_str.strip().split(":")
+        if len(parts) == 4:  # ip:port:username:password
+            ip, port, username, password = parts
+            return ProxyConfig(
+                server=f"http://{ip}:{port}",
+                username=username,
+                password=password,
+                ip=ip,
+            )
+        if len(parts) == 2:  # ip:port only
+            ip, port = parts
+            return ProxyConfig(server=f"http://{ip}:{port}", ip=ip)
+        raise ValueError(f"Invalid proxy string format: {proxy_str}")
+
+    @staticmethod
+    def from_dict(proxy_dict: Dict) -> "ProxyConfig":
+        """Create a ProxyConfig from a dict with keys server/username/password/ip."""
+        return ProxyConfig(
+            server=proxy_dict.get("server"),
+            username=proxy_dict.get("username"),
+            password=proxy_dict.get("password"),
+            ip=proxy_dict.get("ip"),
+        )
+
+    @staticmethod
+    def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
+        """Load proxies from environment variable.
+
+        Args:
+            env_var: Name of environment variable containing comma-separated proxy strings
+
+        Returns:
+            List of ProxyConfig objects; blank or malformed entries are skipped
+        """
+        proxies = []
+        for proxy in os.getenv(env_var, "").split(","):
+            if not proxy.strip():
+                continue
+            try:  # per entry, so one bad proxy string does not discard the rest
+                proxies.append(ProxyConfig.from_string(proxy))
+            except ValueError as e:
+                print(f"Error loading proxies from environment: {e}")
+        return proxies
+
+    def to_dict(self) -> Dict:
+        """Convert to dictionary representation."""
+        return {
+            "server": self.server,
+            "username": self.username,
+            "password": self.password,
+            "ip": self.ip,
+        }
+
+    def clone(self, **kwargs) -> "ProxyConfig":
+        """Create a copy of this configuration with updated values.
+
+        Args:
+            **kwargs: Key-value pairs of configuration options to update
+
+        Returns:
+            ProxyConfig: A new instance with the specified updates
+        """
+        config_dict = self.to_dict()
+        config_dict.update(kwargs)
+        return ProxyConfig.from_dict(config_dict)
--- a/crawl4ai/deep_crawling/init.py
+++ b/crawl4ai/deep_crawling/init.py
@@ -8,6 +8,7 @@ from .filters import (
    ContentTypeFilter,
    DomainFilter,
    URLFilter,
+    URLPatternFilter,
    FilterStats,
    ContentRelevanceFilter,
    SEOFilter
@@ -32,6 +33,7 @@ __all__ = [
    "ContentTypeFilter",
    "DomainFilter",
    "URLFilter",
+    "URLPatternFilter",
    "FilterStats",
    "ContentRelevanceFilter",
    "SEOFilter",
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,5 +1,5 @@
 from re import U
-from pydantic import BaseModel, HttpUrl
+from pydantic import BaseModel, HttpUrl, PrivateAttr
 from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
 from enum import Enum
 from dataclasses import dataclass
@@ -86,6 +86,9 @@ class MarkdownGenerationResult(BaseModel):
    fit_markdown: Optional[str] = None
    fit_html: Optional[str] = None

+    def __str__(self) -> str:  # str(result) yields the raw markdown text
+        return self.raw_markdown
+
@dataclass
 class TraversalStats:
    """Statistics for the traversal process"""
@@ -105,7 +108,6 @@ class DispatchResult(BaseModel):
    end_time: Union[datetime, float]
    error_message: str = ""

-
 class CrawlResult(BaseModel):
    url: str
    html: str
@@ -117,10 +119,7 @@ class CrawlResult(BaseModel):
    js_execution_result: Optional[Dict[str, Any]] = None
    screenshot: Optional[str] = None
    pdf: Optional[bytes] = None
-    markdown: Optional[Union[str, MarkdownGenerationResult]] = None
-    markdown_v2: Optional[MarkdownGenerationResult] = None
-    fit_markdown: Optional[str] = None
-    fit_html: Optional[str] = None
+    _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
@@ -134,6 +133,118 @@ class CrawlResult(BaseModel):
    class Config:
        arbitrary_types_allowed = True

+# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
+# and model_dump override all exist to support a smooth transition from markdown as a string
+# to markdown as a MarkdownGenerationResult object, while maintaining backward compatibility.
+# 
+# This allows code that expects markdown to be a string to continue working, while also
+# providing access to the full MarkdownGenerationResult object's properties.
+# 
+# The markdown_v2 property is deprecated and raises an error directing users to use markdown.
+# 
+# When backward compatibility is no longer needed in future versions, this entire mechanism
+# can be simplified to a standard field with no custom accessors or serialization logic.
+    
+    def __init__(self, **data):
+        """Pop 'markdown' (not a declared field) and keep it in the private _markdown slot."""
+        markdown_result = data.pop('markdown', None)
+        super().__init__(**data)
+        self._markdown = self._coerce_markdown(markdown_result)
+
+    @staticmethod
+    def _coerce_markdown(value):
+        """Turn str / dict / wrapper input into a MarkdownGenerationResult; pass others through."""
+        value = getattr(value, "_markdown_result", value)  # unwrap a StringCompatibleMarkdown
+        blank = {"raw_markdown": "", "markdown_with_citations": "", "references_markdown": ""}
+        if isinstance(value, str):  # e.g. a cached row that kept only the raw markdown text
+            value = {"raw_markdown": value}
+        if isinstance(value, dict):
+            return MarkdownGenerationResult(**{**blank, **value})
+        return value
+
+    @property
+    def markdown(self):
+        """Return a str-like StringCompatibleMarkdown view of the stored result, or None if unset."""
+        return None if self._markdown is None else StringCompatibleMarkdown(self._markdown)
+
+    @markdown.setter
+    def markdown(self, value):  # normalised so the getter and model_dump always see a result object
+        self._markdown = self._coerce_markdown(value)
+    
+    @property
+    def markdown_v2(self):
+        """Deprecated accessor; always raises AttributeError pointing callers at 'markdown'.
+
+        The message is assembled from adjacent one-line literals so that it
+        carries no source-code indentation, and it names each attribute that
+        is now reached through 'markdown'.
+        """
+        raise AttributeError(
+            "The 'markdown_v2' attribute is deprecated and has been removed. "
+            "Please use 'markdown' instead, which now returns a MarkdownGenerationResult, "
+            "with following properties:\n"
+            "- raw_markdown: The raw markdown string\n"
+            "- markdown_with_citations: The markdown string with citations\n"
+            "- references_markdown: The markdown string with references\n"
+            "- fit_markdown: The markdown string with fit text\n"
+            "- fit_html: The fit HTML string"
+        )
+    
+    @property
+    def fit_markdown(self):
+        """Always raise AttributeError: this attribute was removed from CrawlResult."""
+        # Kept only so that old call sites fail with a pointer to the replacement.
+        message = (
+            "The 'fit_markdown' attribute is deprecated and has been removed. "
+            "Please use 'markdown.fit_markdown' instead."
+        )
+        raise AttributeError(message)
+    
+    @property
+    def fit_html(self):
+        """Always raise AttributeError: this attribute was removed from CrawlResult."""
+        # Kept only so that old call sites fail with a pointer to the replacement.
+        message = (
+            "The 'fit_html' attribute is deprecated and has been removed. "
+            "Please use 'markdown.fit_html' instead."
+        )
+        raise AttributeError(message)
+
+    def model_dump(self, *args, **kwargs):
+        """Serialise the model and add the privately stored markdown under "markdown".
+
+        Private attributes are left out of the regular dump, so the stored
+        markdown result is dumped separately and merged into the output.
+
+        Args:
+            *args: Forwarded unchanged to the parent model_dump.
+            **kwargs: Forwarded unchanged to the parent model_dump.
+
+        Returns:
+            dict: The base dump, plus a "markdown" entry when markdown is set.
+        """
+        dumped = super().model_dump(*args, **kwargs)
+        stored = self._markdown
+        if stored is None:
+            return dumped
+        dumped["markdown"] = stored.model_dump()  # dumped without the caller's options
+        return dumped
+
+class StringCompatibleMarkdown(str):
+    """A str holding raw_markdown that forwards unknown attributes to the wrapped result."""
+    def __new__(cls, markdown_result):
+        return super().__new__(cls, markdown_result.raw_markdown)
+
+    def __init__(self, markdown_result):
+        self._markdown_result = markdown_result
+
+    def __getattr__(self, name):  # fallback lookup; object.__getattribute__ cannot re-enter this hook
+        return getattr(object.__getattribute__(self, "_markdown_result"), name)
+
+# END of backward compatibility code for markdown/markdown_v2.
+# When removing this code in the future, make sure to:
+# 1. Replace the private attribute and property with a standard field
+# 2. Update any serialization logic that might depend on the current behavior

 class AsyncCrawlResponse(BaseModel):
    html: str
--- a/crawl4ai/processors/pdf/processor.py
+++ b/crawl4ai/processors/pdf/processor.py
@@ -484,4 +484,4 @@ if __name__ == "__main__":
        for page in result.pages:
            f.write(f'# Page {page["page_number"]}\n\n')
            f.write(clean_pdf_text(page["page_number"], page['raw_text']))
-            f.write('\n\n')
+            f.write('\n\n')
--- a/crawl4ai/proxy_strategy.py
+++ b/crawl4ai/proxy_strategy.py
@@ -2,6 +2,8 @@ from typing import List, Dict, Optional
 from abc import ABC, abstractmethod
 from itertools import cycle

+from crawl4ai.configs import ProxyConfig
+
 class ProxyRotationStrategy(ABC):
    """Base abstract class for proxy rotation strategies"""
    
@@ -15,28 +17,27 @@ class ProxyRotationStrategy(ABC):
        """Add proxy configurations to the strategy"""
        pass

-class RoundRobinProxyStrategy(ProxyRotationStrategy):
-    """Simple round-robin proxy rotation strategy"""
+class RoundRobinProxyStrategy:
+    """Simple round-robin proxy rotation strategy using ProxyConfig objects"""

-    def __init__(self, proxies: List[Dict] = None):
+    def __init__(self, proxies: List[ProxyConfig] = None):
        """
        Initialize with optional list of proxy configurations
        
        Args:
-            proxies: List of proxy config dictionaries, each containing at least
-                    'server' key with proxy URL
+            proxies: List of ProxyConfig objects
        """
        self._proxies = []
        self._proxy_cycle = None
        if proxies:
            self.add_proxies(proxies)

-    def add_proxies(self, proxies: List[Dict]):
+    def add_proxies(self, proxies: List[ProxyConfig]):
        """Add new proxies to the rotation pool"""
        self._proxies.extend(proxies)
        self._proxy_cycle = cycle(self._proxies)

-    async def get_next_proxy(self) -> Optional[Dict]:
+    async def get_next_proxy(self) -> Optional[ProxyConfig]:
        """Get next proxy in round-robin fashion"""
        if not self._proxy_cycle:
            return None