Release prep (#749)
* fix: Update export of URLPatternFilter * chore: Add dependancy for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependancy list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependancies * feat:Make PyPDF2 a conditional dependancy * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
from re import U
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from pydantic import BaseModel, HttpUrl, PrivateAttr
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
@@ -86,6 +86,9 @@ class MarkdownGenerationResult(BaseModel):
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
return self.raw_markdown
|
||||
|
||||
@dataclass
|
||||
class TraversalStats:
|
||||
"""Statistics for the traversal process"""
|
||||
@@ -105,7 +108,6 @@ class DispatchResult(BaseModel):
|
||||
end_time: Union[datetime, float]
|
||||
error_message: str = ""
|
||||
|
||||
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
@@ -117,10 +119,7 @@ class CrawlResult(BaseModel):
|
||||
js_execution_result: Optional[Dict[str, Any]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf: Optional[bytes] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
markdown_v2: Optional[MarkdownGenerationResult] = None
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
_markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
|
||||
extracted_content: Optional[str] = None
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
@@ -134,6 +133,118 @@ class CrawlResult(BaseModel):
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
|
||||
# and model_dump override all exist to support a smooth transition from markdown as a string
|
||||
# to markdown as a MarkdownGenerationResult object, while maintaining backward compatibility.
|
||||
#
|
||||
# This allows code that expects markdown to be a string to continue working, while also
|
||||
# providing access to the full MarkdownGenerationResult object's properties.
|
||||
#
|
||||
# The markdown_v2 property is deprecated and raises an error directing users to use markdown.
|
||||
#
|
||||
# When backward compatibility is no longer needed in future versions, this entire mechanism
|
||||
# can be simplified to a standard field with no custom accessors or serialization logic.
|
||||
|
||||
def __init__(self, **data):
|
||||
markdown_result = data.pop('markdown', None)
|
||||
super().__init__(**data)
|
||||
if markdown_result is not None:
|
||||
self._markdown = markdown_result
|
||||
|
||||
@property
|
||||
def markdown(self):
|
||||
"""
|
||||
Property that returns a StringCompatibleMarkdown object that behaves like
|
||||
a string but also provides access to MarkdownGenerationResult attributes.
|
||||
|
||||
This approach allows backward compatibility with code that expects 'markdown'
|
||||
to be a string, while providing access to the full MarkdownGenerationResult.
|
||||
"""
|
||||
if self._markdown is None:
|
||||
return None
|
||||
return StringCompatibleMarkdown(self._markdown)
|
||||
|
||||
@markdown.setter
|
||||
def markdown(self, value):
|
||||
"""
|
||||
Setter for the markdown property.
|
||||
"""
|
||||
self._markdown = value
|
||||
|
||||
@property
|
||||
def markdown_v2(self):
|
||||
"""
|
||||
Deprecated property that raises an AttributeError when accessed.
|
||||
|
||||
This property exists to inform users that 'markdown_v2' has been
|
||||
deprecated and they should use 'markdown' instead.
|
||||
"""
|
||||
raise AttributeError(
|
||||
"The 'markdown_v2' attribute is deprecated and has been removed. "
|
||||
"""Please use 'markdown' instead, which now returns a MarkdownGenerationResult, with
|
||||
following properties:
|
||||
- raw_markdown: The raw markdown string
|
||||
- markdown_with_citations: The markdown string with citations
|
||||
- references_markdown: The markdown string with references
|
||||
- fit_markdown: The markdown string with fit text
|
||||
"""
|
||||
)
|
||||
|
||||
@property
|
||||
def fit_markdown(self):
|
||||
"""
|
||||
Deprecated property that raises an AttributeError when accessed.
|
||||
"""
|
||||
raise AttributeError(
|
||||
"The 'fit_markdown' attribute is deprecated and has been removed. "
|
||||
"Please use 'markdown.fit_markdown' instead."
|
||||
)
|
||||
|
||||
@property
|
||||
def fit_html(self):
|
||||
"""
|
||||
Deprecated property that raises an AttributeError when accessed.
|
||||
"""
|
||||
raise AttributeError(
|
||||
"The 'fit_html' attribute is deprecated and has been removed. "
|
||||
"Please use 'markdown.fit_html' instead."
|
||||
)
|
||||
|
||||
def model_dump(self, *args, **kwargs):
|
||||
"""
|
||||
Override model_dump to include the _markdown private attribute in serialization.
|
||||
|
||||
This override is necessary because:
|
||||
1. PrivateAttr fields are excluded from serialization by default
|
||||
2. We need to maintain backward compatibility by including the 'markdown' field
|
||||
in the serialized output
|
||||
3. We're transitioning from 'markdown_v2' to enhancing 'markdown' to hold
|
||||
the same type of data
|
||||
|
||||
Future developers: This method ensures that the markdown content is properly
|
||||
serialized despite being stored in a private attribute. If the serialization
|
||||
requirements change, this is where you would update the logic.
|
||||
"""
|
||||
result = super().model_dump(*args, **kwargs)
|
||||
if self._markdown is not None:
|
||||
result["markdown"] = self._markdown.model_dump()
|
||||
return result
|
||||
|
||||
class StringCompatibleMarkdown(str):
|
||||
"""A string subclass that also provides access to MarkdownGenerationResult attributes"""
|
||||
def __new__(cls, markdown_result):
|
||||
return super().__new__(cls, markdown_result.raw_markdown)
|
||||
|
||||
def __init__(self, markdown_result):
|
||||
self._markdown_result = markdown_result
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._markdown_result, name)
|
||||
|
||||
# END of backward compatibility code for markdown/markdown_v2.
|
||||
# When removing this code in the future, make sure to:
|
||||
# 1. Replace the private attribute and property with a standard field
|
||||
# 2. Update any serialization logic that might depend on the current behavior
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
|
||||
Reference in New Issue
Block a user