Release prep (#749)

* fix: Update export of URLPatternFilter * chore: Add dependency for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependency list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependencies * feat: Make PyPDF2 a conditional dependency * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
parent 3a87b4e43b
commit a9e24307cc
38 changed files with 2040 additions and 326 deletions
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -1,5 +1,5 @@
 from re import U
-from pydantic import BaseModel, HttpUrl
+from pydantic import BaseModel, HttpUrl, PrivateAttr
 from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
 from enum import Enum
 from dataclasses import dataclass
@@ -86,6 +86,9 @@ class MarkdownGenerationResult(BaseModel):
    fit_markdown: Optional[str] = None
    fit_html: Optional[str] = None

+    def __str__(self):
+        """Return ``raw_markdown`` as the string form of this result."""
+        return self.raw_markdown
+
@dataclass
 class TraversalStats:
    """Statistics for the traversal process"""
@@ -105,7 +108,6 @@ class DispatchResult(BaseModel):
    end_time: Union[datetime, float]
    error_message: str = ""

-
 class CrawlResult(BaseModel):
    url: str
    html: str
@@ -117,10 +119,7 @@ class CrawlResult(BaseModel):
    js_execution_result: Optional[Dict[str, Any]] = None
    screenshot: Optional[str] = None
    pdf: Optional[bytes] = None
-    markdown: Optional[Union[str, MarkdownGenerationResult]] = None
-    markdown_v2: Optional[MarkdownGenerationResult] = None
-    fit_markdown: Optional[str] = None
-    fit_html: Optional[str] = None
+    _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
@@ -134,6 +133,118 @@ class CrawlResult(BaseModel):
    class Config:
        arbitrary_types_allowed = True

+# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
+# and model_dump override all exist to support a smooth transition from markdown as a string
+# to markdown as a MarkdownGenerationResult object, while maintaining backward compatibility.
+# 
+# This allows code that expects markdown to be a string to continue working, while also
+# providing access to the full MarkdownGenerationResult object's properties.
+# 
+# The markdown_v2 property is deprecated and raises an error directing users to use markdown.
+# 
+# When backward compatibility is no longer needed in future versions, this entire mechanism
+# can be simplified to a standard field with no custom accessors or serialization logic.
+    
+    def __init__(self, **data):
+        """
+        Build the model, routing a ``markdown`` keyword to the private ``_markdown`` attribute.
+
+        Args:
+            **data: Field values for the model. A ``markdown`` entry, if present,
+                is removed before the remaining values are passed to the base
+                initializer.
+        """
+        # 'markdown' is exposed as a property on this class rather than a declared
+        # field, so it is taken out of the payload before the base initializer runs.
+        markdown_result = data.pop('markdown', None)
+        super().__init__(**data)
+        # NOTE(review): the value is stored as-is, with no type check or conversion.
+        # model_dump() later calls `.model_dump()` on it, so a plain str or dict
+        # passed here would presumably fail at serialization time -- confirm that
+        # callers always pass a MarkdownGenerationResult.
+        if markdown_result is not None:
+            self._markdown = markdown_result
+    
+    @property
+    def markdown(self):
+        """
+        Property that returns a StringCompatibleMarkdown object that behaves like
+        a string but also provides access to MarkdownGenerationResult attributes.
+        
+        This approach allows backward compatibility with code that expects 'markdown'
+        to be a string, while providing access to the full MarkdownGenerationResult.
+
+        Returns:
+            None when nothing has been stored in ``_markdown``; otherwise a new
+            StringCompatibleMarkdown wrapping ``_markdown`` (a fresh wrapper is
+            created on every access).
+        """
+        if self._markdown is None:
+            return None
+        return StringCompatibleMarkdown(self._markdown)
+    
+    @markdown.setter
+    def markdown(self, value):
+        """
+        Store ``value`` in the private ``_markdown`` attribute.
+
+        The value is assigned as-is, without validation. The getter passes it to
+        StringCompatibleMarkdown, which reads its ``raw_markdown`` attribute.
+        """
+        self._markdown = value
+    
+    @property
+    def markdown_v2(self):
+        """
+        Deprecated property that raises an AttributeError when accessed.
+
+        This property exists to inform users that 'markdown_v2' has been
+        deprecated and they should use 'markdown' instead.
+
+        Raises:
+            AttributeError: Always, with a message pointing to 'markdown'.
+        """
+        # The message is built from two adjacent string literals; the triple-quoted
+        # part keeps its source indentation, so the emitted text contains those
+        # leading spaces and newlines.
+        raise AttributeError(
+            "The 'markdown_v2' attribute is deprecated and has been removed. "
+            """Please use 'markdown' instead, which now returns a MarkdownGenerationResult, with
+            following properties:
+            - raw_markdown: The raw markdown string
+            - markdown_with_citations: The markdown string with citations
+            - references_markdown: The markdown string with references
+            - fit_markdown: The markdown string with fit text
+            """
+        )
+    
+    @property
+    def fit_markdown(self):
+        """
+        Deprecated property that raises an AttributeError when accessed.
+
+        Raises:
+            AttributeError: Always, with a message pointing to
+                'markdown.fit_markdown'.
+        """
+        raise AttributeError(
+            "The 'fit_markdown' attribute is deprecated and has been removed. "
+            "Please use 'markdown.fit_markdown' instead."
+        )
+    
+    @property
+    def fit_html(self):
+        """
+        Deprecated property that raises an AttributeError when accessed.
+
+        Raises:
+            AttributeError: Always, with a message pointing to
+                'markdown.fit_html'.
+        """
+        raise AttributeError(
+            "The 'fit_html' attribute is deprecated and has been removed. "
+            "Please use 'markdown.fit_html' instead."
+        )
+
+    def model_dump(self, *args, **kwargs):
+        """
+        Override model_dump to include the _markdown private attribute in serialization.
+        
+        This override is necessary because:
+        1. PrivateAttr fields are excluded from serialization by default
+        2. We need to maintain backward compatibility by including the 'markdown' field
+           in the serialized output
+        3. We're transitioning from 'markdown_v2' to enhancing 'markdown' to hold
+           the same type of data
+        
+        Future developers: This method ensures that the markdown content is properly
+        serialized despite being stored in a private attribute. If the serialization
+        requirements change, this is where you would update the logic.
+
+        Args:
+            *args: Forwarded unchanged to the base ``model_dump``.
+            **kwargs: Forwarded unchanged to the base ``model_dump``.
+
+        Returns:
+            The base dump, with a 'markdown' key added when ``_markdown`` is set.
+        """
+        result = super().model_dump(*args, **kwargs)
+        # The arguments above reach only the base dump: the nested dump below is
+        # called without arguments, and the 'markdown' key is added whenever
+        # `_markdown` is set, regardless of what was passed in.
+        if self._markdown is not None:
+            result["markdown"] = self._markdown.model_dump() 
+        return result
+
+class StringCompatibleMarkdown(str):
+    """A string subclass that also provides access to MarkdownGenerationResult attributes
+
+    The string value is the wrapped object's ``raw_markdown``; any attribute that
+    normal lookup does not find on the string is looked up on the wrapped object.
+    """
+    def __new__(cls, markdown_result):
+        # The string content is fixed here, at creation time, from raw_markdown.
+        return super().__new__(cls, markdown_result.raw_markdown)
+    
+    def __init__(self, markdown_result):
+        # Keep the wrapped object so __getattr__ can delegate to it.
+        self._markdown_result = markdown_result
+    
+    def __getattr__(self, name):
+        # Only reached when normal attribute lookup fails, i.e. for names that
+        # are not already available on the str instance.
+        # NOTE(review): if `_markdown_result` itself were ever missing from the
+        # instance, this lookup would re-enter __getattr__ -- confirm instances
+        # are only created through the constructor.
+        return getattr(self._markdown_result, name)
+
+# END of backward compatibility code for markdown/markdown_v2.
+# When removing this code in the future, make sure to:
+# 1. Replace the private attribute and property with a standard field
+# 2. Update any serialization logic that might depend on the current behavior

 class AsyncCrawlResponse(BaseModel):
    html: str