Release prep (#749)

* fix: Update export of URLPatternFilter * chore: Add dependency for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependency list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependencies * feat: Make PyPDF2 a conditional dependency * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
2025-02-28 17:23:35 +05:30
parent 3a87b4e43b
commit a9e24307cc
38 changed files with 2040 additions and 326 deletions
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -7,7 +7,7 @@ from contextlib import asynccontextmanager
 import logging
 import json  # Added for serialization/deserialization
 from .utils import ensure_content_dirs, generate_content_hash
-from .models import CrawlResult, MarkdownGenerationResult
+from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown
 import aiofiles
 from .utils import VersionManager
 from .async_logger import AsyncLogger
@@ -336,12 +336,17 @@ class AsyncDatabaseManager:
                    except json.JSONDecodeError:
                        # Very UGLY, never mention it to me please
                        if field == "markdown" and isinstance(row_dict[field], str):
-                            row_dict[field] = row_dict[field]
+                            row_dict[field] = MarkdownGenerationResult(
+                                raw_markdown=row_dict[field] or "",
+                                markdown_with_citations="",
+                                references_markdown="",
+                                fit_markdown="",
+                                fit_html="",
+                            )
                        else:
                            row_dict[field] = {}

                if isinstance(row_dict["markdown"], Dict):
-                    row_dict["markdown_v2"] = row_dict["markdown"]
                    if row_dict["markdown"].get("raw_markdown"):
                        row_dict["markdown"] = row_dict["markdown"]["raw_markdown"]

@@ -358,7 +363,7 @@ class AsyncDatabaseManager:
                # Remove any fields not in CrawlResult model
                valid_fields = CrawlResult.__annotations__.keys()
                filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
-
+                filtered_dict["markdown"] = row_dict["markdown"]
                return CrawlResult(**filtered_dict)

        try:
@@ -384,14 +389,14 @@ class AsyncDatabaseManager:
        }

        try:
-            if isinstance(result.markdown, MarkdownGenerationResult):
+            if isinstance(result.markdown, StringCompatibleMarkdown):
                content_map["markdown"] = (
-                    result.markdown.model_dump_json(),
+                    result.markdown,
                    "markdown",
                )
-            elif hasattr(result, "markdown_v2"):
+            elif isinstance(result.markdown, MarkdownGenerationResult):
                content_map["markdown"] = (
-                    result.markdown_v2.model_dump_json(),
+                    result.markdown.model_dump_json(),
                    "markdown",
                )
            elif isinstance(result.markdown, str):