Merge next and resolve conflicts. Fix some import errors and error handling in server.py

2025-04-19 20:27:47 +05:30
parent d2648eaa39 16b2318242
commit b27bb367e8
23 changed files with 5660 additions and 91 deletions
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -47,6 +47,7 @@ from .utils import (
    create_box_message,
    get_error_context,
    RobotsParser,
+    preprocess_html_for_schema,
 )


@@ -111,7 +112,8 @@ class AsyncWebCrawler:
        self,
        crawler_strategy: AsyncCrawlerStrategy = None,
        config: BrowserConfig = None,
-        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
+        base_directory: str = str(
+            os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
        thread_safe: bool = False,
        logger: AsyncLoggerBase = None,
        **kwargs,
@@ -139,7 +141,8 @@ class AsyncWebCrawler:
        )

        # Initialize crawler strategy
-        params = {k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]}
+        params = {k: v for k, v in kwargs.items() if k in [
+            "browser_config", "logger"]}
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            browser_config=browser_config,
            logger=self.logger,
@@ -237,7 +240,8 @@ class AsyncWebCrawler:

        config = config or CrawlerRunConfig()
        if not isinstance(url, str) or not url:
-            raise ValueError("Invalid URL, make sure the URL is a non-empty string")
+            raise ValueError(
+                "Invalid URL, make sure the URL is a non-empty string")

        async with self._lock or self.nullcontext():
            try:
@@ -291,12 +295,12 @@ class AsyncWebCrawler:

                # Update proxy configuration from rotation strategy if available
                if config and config.proxy_rotation_strategy:
-                    next_proxy : ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
+                    next_proxy: ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
                    if next_proxy:
                        self.logger.info(
                            message="Switch proxy: {proxy}",
                            tag="PROXY",
-                            params={"proxy": next_proxy.server} 
+                            params={"proxy": next_proxy.server}
                        )
                        config.proxy_config = next_proxy
                        # config = config.clone(proxy_config=next_proxy)
@@ -306,7 +310,8 @@ class AsyncWebCrawler:
                    t1 = time.perf_counter()

                    if config.user_agent:
-                        self.crawler_strategy.update_user_agent(config.user_agent)
+                        self.crawler_strategy.update_user_agent(
+                            config.user_agent)

                    # Check robots.txt if enabled
                    if config and config.check_robots_txt:
@@ -373,7 +378,8 @@ class AsyncWebCrawler:
                    crawl_result.console_messages = async_response.console_messages

                    crawl_result.success = bool(html)
-                    crawl_result.session_id = getattr(config, "session_id", None)
+                    crawl_result.session_id = getattr(
+                        config, "session_id", None)

                    self.logger.url_status(
                        url=cache_context.display_url,
@@ -396,7 +402,8 @@ class AsyncWebCrawler:
                        tag="COMPLETE"
                    )
                    cached_result.success = bool(html)
-                    cached_result.session_id = getattr(config, "session_id", None)
+                    cached_result.session_id = getattr(
+                        config, "session_id", None)
                    cached_result.redirected_url = cached_result.redirected_url or url
                    return CrawlResultContainer(cached_result)

@@ -463,12 +470,14 @@ class AsyncWebCrawler:
            params = config.__dict__.copy()
            params.pop("url", None)
            # add keys from kwargs to params that doesn't exist in params
-            params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
+            params.update({k: v for k, v in kwargs.items()
+                          if k not in params.keys()})

            ################################
            # Scraping Strategy Execution  #
            ################################
-            result: ScrapingResult = scraping_strategy.scrap(url, html, **params)
+            result: ScrapingResult = scraping_strategy.scrap(
+                url, html, **params)

            if result is None:
                raise ValueError(
@@ -484,7 +493,8 @@ class AsyncWebCrawler:

        # Extract results - handle both dict and ScrapingResult
        if isinstance(result, dict):
-            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+            cleaned_html = sanitize_input_encode(
+                result.get("cleaned_html", ""))
            media = result.get("media", {})
            links = result.get("links", {})
            metadata = result.get("metadata", {})
@@ -501,14 +511,49 @@ class AsyncWebCrawler:
            config.markdown_generator or DefaultMarkdownGenerator()
        )

+        # --- SELECT HTML SOURCE BASED ON CONTENT_SOURCE ---
+        # Get the desired source from the generator config, default to 'cleaned_html'
+        selected_html_source = getattr(markdown_generator, 'content_source', 'cleaned_html')
+
+        # Define the source selection logic using dict dispatch
+        html_source_selector = {
+            "raw_html": lambda: html,  # The original raw HTML
+            "cleaned_html": lambda: cleaned_html,  # The HTML after scraping strategy
+            "fit_html": lambda: preprocess_html_for_schema(html_content=html),  # Preprocessed raw HTML
+        }
+
+        markdown_input_html = cleaned_html  # Default to cleaned_html
+
+        try:
+            # Get the appropriate lambda function, default to returning cleaned_html if key not found
+            source_lambda = html_source_selector.get(selected_html_source, lambda: cleaned_html)
+            # Execute the lambda to get the selected HTML
+            markdown_input_html = source_lambda()
+
+            # Log which source is being used (optional, but helpful for debugging)
+            # if self.logger and verbose:
+            #     actual_source_used = selected_html_source if selected_html_source in html_source_selector else 'cleaned_html (default)'
+            #     self.logger.debug(f"Using '{actual_source_used}' as source for Markdown generation for {url}", tag="MARKDOWN_SRC")
+
+        except Exception as e:
+            # Handle potential errors, especially from preprocess_html_for_schema
+            if self.logger:
+                self.logger.warning(
+                    f"Error getting/processing '{selected_html_source}' for markdown source: {e}. Falling back to cleaned_html.",
+                    tag="MARKDOWN_SRC"
+                )
+            # Ensure markdown_input_html is still the default cleaned_html in case of error
+            markdown_input_html = cleaned_html
+        # --- END: HTML SOURCE SELECTION ---
+
        # Uncomment if by default we want to use PruningContentFilter
        # if not config.content_filter and not markdown_generator.content_filter:
        #     markdown_generator.content_filter = PruningContentFilter()

        markdown_result: MarkdownGenerationResult = (
            markdown_generator.generate_markdown(
-                cleaned_html=cleaned_html,
-                base_url=params.get("redirected_url", url),
+                input_html=markdown_input_html,
+                base_url=params.get("redirected_url", url)
                # html2text_options=kwargs.get('html2text', {})
            )
        )
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -31,22 +31,24 @@ class MarkdownGenerationStrategy(ABC):
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
        verbose: bool = False,
+        content_source: str = "cleaned_html",
    ):
        self.content_filter = content_filter
        self.options = options or {}
        self.verbose = verbose
+        self.content_source = content_source

    @abstractmethod
    def generate_markdown(
        self,
-        cleaned_html: str,
+        input_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs,
    ) -> MarkdownGenerationResult:
-        """Generate markdown from cleaned HTML."""
+        """Generate markdown from the selected input HTML."""
        pass


@@ -63,6 +65,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
+        content_source (str): Source of content to generate markdown from. Options: "cleaned_html", "raw_html", "fit_html". Defaults to "cleaned_html".

    Returns:
        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
@@ -72,8 +75,9 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
+        content_source: str = "cleaned_html",
    ):
-        super().__init__(content_filter, options)
+        super().__init__(content_filter, options, verbose=False, content_source=content_source)

    def convert_links_to_citations(
        self, markdown: str, base_url: str = ""
@@ -143,7 +147,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):

    def generate_markdown(
        self,
-        cleaned_html: str,
+        input_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        options: Optional[Dict[str, Any]] = None,
@@ -152,16 +156,16 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
        **kwargs,
    ) -> MarkdownGenerationResult:
        """
-        Generate markdown with citations from cleaned HTML.
+        Generate markdown with citations from the provided input HTML.

        How it works:
-        1. Generate raw markdown from cleaned HTML.
+        1. Generate raw markdown from the input HTML.
        2. Convert links to citations.
        3. Generate fit markdown if content filter is provided.
        4. Return MarkdownGenerationResult.

        Args:
-            cleaned_html (str): Cleaned HTML content.
+            input_html (str): The HTML content to process (selected based on content_source).
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
@@ -196,14 +200,14 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
            h.update_params(**default_options)

            # Ensure we have valid input
-            if not cleaned_html:
-                cleaned_html = ""
-            elif not isinstance(cleaned_html, str):
-                cleaned_html = str(cleaned_html)
+            if not input_html:
+                input_html = ""
+            elif not isinstance(input_html, str):
+                input_html = str(input_html)

            # Generate raw markdown
            try:
-                raw_markdown = h.handle(cleaned_html)
+                raw_markdown = h.handle(input_html)
            except Exception as e:
                raw_markdown = f"Error converting HTML to markdown: {str(e)}"

@@ -228,7 +232,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
            if content_filter or self.content_filter:
                try:
                    content_filter = content_filter or self.content_filter
-                    filtered_html = content_filter.filter_content(cleaned_html)
+                    filtered_html = content_filter.filter_content(input_html)
                    filtered_html = "\n".join(
                        "<div>{}</div>".format(s) for s in filtered_html
                    )