Add support for parallel URL processing in extraction utilities

Author: unclecode
Date:   2026-01-24 04:13:39 +00:00
Parent: b0b3ca1222
Commit: 2d5e5306c5
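
In practice, the schema generators now accept either one URL or a list of URLs. A minimal usage sketch, assuming crawl4ai's public entry point `JsonCssExtractionStrategy.generate_schema` and `LLMConfig` (names taken from the library's API, not shown in this diff):

    # Hedged usage sketch -- entry-point names assumed, not confirmed by this diff.
    from crawl4ai import LLMConfig, JsonCssExtractionStrategy

    llm = LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY")

    # Single URL: same behavior as before this commit.
    schema = JsonCssExtractionStrategy.generate_schema(
        url="https://example.com/products?page=1",
        query="Extract product name and price",
        llm_config=llm,
    )

    # New: a list of URLs is fetched in parallel, and each page's cleaned HTML
    # is concatenated into one prompt before schema generation.
    schema = JsonCssExtractionStrategy.generate_schema(
        url=[
            "https://example.com/products?page=1",
            "https://example.com/products?page=2",
        ],
        query="Extract product name and price",
        llm_config=llm,
    )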


@@ -1349,11 +1349,11 @@ In this scenario, use your best judgment to generate the schema. You need to exa
         llm_config: 'LLMConfig' = create_llm_config(),
         provider: str = None,
         api_token: str = None,
-        url: str = None,
+        url: Union[str, List[str]] = None,
         **kwargs
     ) -> dict:
         """
-        Generate extraction schema from HTML content or URL (sync version).
+        Generate extraction schema from HTML content or URL(s) (sync version).
 
         Args:
             html (str, optional): The HTML content to analyze. If not provided, url must be set.
@@ -1363,7 +1363,8 @@ In this scenario, use your best judgment to generate the schema. You need to exa
             llm_config (LLMConfig): LLM configuration object.
             provider (str): Legacy Parameter. LLM provider to use.
             api_token (str): Legacy Parameter. API token for LLM provider.
-            url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
+            url (str or List[str], optional): URL(s) to fetch HTML from. If provided, html parameter is ignored.
+                When multiple URLs are provided, HTMLs are fetched in parallel and concatenated.
             **kwargs: Additional args passed to LLM processor.
 
         Returns:
@@ -1408,11 +1409,11 @@ In this scenario, use your best judgment to generate the schema. You need to exa
         llm_config: 'LLMConfig' = None,
         provider: str = None,
         api_token: str = None,
-        url: str = None,
+        url: Union[str, List[str]] = None,
         **kwargs
     ) -> dict:
         """
-        Generate extraction schema from HTML content or URL (async version).
+        Generate extraction schema from HTML content or URL(s) (async version).
 
         Use this method when calling from async contexts (e.g., FastAPI) to avoid
         issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
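
The docstring above recommends the async variant from async contexts such as FastAPI. A minimal sketch of that call path, assuming the async method is named `agenerate_schema` (the name is not visible in this hunk):

    # Hedged sketch: awaiting the async variant inside a FastAPI handler
    # avoids nesting an event loop, which the sync version would require.
    from typing import List, Union
    from fastapi import FastAPI
    from pydantic import BaseModel
    from crawl4ai import LLMConfig, JsonCssExtractionStrategy

    app = FastAPI()

    class SchemaRequest(BaseModel):
        urls: Union[str, List[str]]  # single URL or several, per this commit
        query: str

    @app.post("/schema")
    async def build_schema(req: SchemaRequest) -> dict:
        return await JsonCssExtractionStrategy.agenerate_schema(  # method name assumed
            url=req.urls,
            query=req.query,
            llm_config=LLMConfig(provider="openai/gpt-4o-mini",
                                 api_token="env:OPENAI_API_KEY"),
        )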
@@ -1426,7 +1427,8 @@ In this scenario, use your best judgment to generate the schema. You need to exa
             llm_config (LLMConfig): LLM configuration object.
             provider (str): Legacy Parameter. LLM provider to use.
             api_token (str): Legacy Parameter. API token for LLM provider.
-            url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
+            url (str or List[str], optional): URL(s) to fetch HTML from. If provided, html parameter is ignored.
+                When multiple URLs are provided, HTMLs are fetched in parallel and concatenated.
             **kwargs: Additional args passed to LLM processor.
 
         Returns:
@@ -1438,7 +1440,7 @@ In this scenario, use your best judgment to generate the schema. You need to exa
         from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
 
         # Validate inputs
-        if html is None and url is None:
+        if html is None and (url is None or (isinstance(url, list) and len(url) == 0)):
             raise ValueError("Either 'html' or 'url' must be provided")
 
         # Check deprecated parameters
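
The tightened guard treats an empty list the same as a missing URL. A standalone restatement of the predicate with its edge cases (`needs_input` is an illustrative name, not part of the diff):

    from typing import List, Optional, Union

    def needs_input(html: Optional[str], url: Union[str, List[str], None]) -> bool:
        """True when neither HTML nor a usable URL was supplied."""
        return html is None and (url is None or (isinstance(url, list) and len(url) == 0))

    assert needs_input(None, None)                       # nothing given -> ValueError
    assert needs_input(None, [])                         # empty list counts as nothing
    assert not needs_input(None, "https://example.com")  # a URL suffices
    assert not needs_input("<html></html>", None)        # raw HTML suffices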
@@ -1449,7 +1451,7 @@ In this scenario, use your best judgment to generate the schema. You need to exa
         if llm_config is None:
             llm_config = create_llm_config()
 
-        # Fetch HTML from URL if provided
+        # Fetch HTML from URL(s) if provided
         if url is not None:
             from .async_webcrawler import AsyncWebCrawler
             from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
@@ -1461,21 +1463,42 @@ In this scenario, use your best judgment to generate the schema. You need to exa
             )
             crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
 
-            async with AsyncWebCrawler(config=browser_config) as crawler:
-                result = await crawler.arun(url=url, config=crawler_config)
-                if not result.success:
-                    raise Exception(f"Failed to fetch URL '{url}': {result.error_message}")
-                if result.status_code >= 400:
-                    raise Exception(f"HTTP {result.status_code} error for URL '{url}'")
-                html = result.html
+            # Normalize to list
+            urls = [url] if isinstance(url, str) else url
 
-        # Preprocess HTML for schema generation (higher text_threshold for better LLM context)
-        html = preprocess_html_for_schema(
-            html_content=html,
-            text_threshold=2000,
-            attr_value_threshold=500,
-            max_size=500_000
-        )
+            async with AsyncWebCrawler(config=browser_config) as crawler:
+                if len(urls) == 1:
+                    result = await crawler.arun(url=urls[0], config=crawler_config)
+                    if not result.success:
+                        raise Exception(f"Failed to fetch URL '{urls[0]}': {result.error_message}")
+                    if result.status_code >= 400:
+                        raise Exception(f"HTTP {result.status_code} error for URL '{urls[0]}'")
+                    html = result.html
+                else:
+                    results = await crawler.arun_many(urls=urls, config=crawler_config)
+                    html_parts = []
+                    for i, result in enumerate(results, 1):
+                        if not result.success:
+                            raise Exception(f"Failed to fetch URL '{result.url}': {result.error_message}")
+                        if result.status_code >= 400:
+                            raise Exception(f"HTTP {result.status_code} error for URL '{result.url}'")
+                        cleaned = preprocess_html_for_schema(
+                            html_content=result.html,
+                            text_threshold=2000,
+                            attr_value_threshold=500,
+                            max_size=500_000
+                        )
+                        html_parts.append(f"'''html example {i}\n{cleaned}\n'''")
+                    html = "\n\n".join(html_parts)
+
+        # Preprocess HTML for schema generation (skip if already preprocessed from multiple URLs)
+        if url is None or isinstance(url, str):
+            html = preprocess_html_for_schema(
+                html_content=html,
+                text_threshold=2000,
+                attr_value_threshold=500,
+                max_size=500_000
+            )
 
         prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
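
The core of the change is the multi-URL branch above: `arun_many` dispatches all fetches concurrently, each result is preprocessed on its own (so every page gets the full 500_000-character budget), and the cleaned fragments are wrapped in delimited blocks so the LLM sees them as distinct examples. The same pattern in isolation; the crawler and helper names come from the diff, while the harness around them is illustrative:

    import asyncio
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
    from crawl4ai.utils import preprocess_html_for_schema  # import path assumed

    async def fetch_combined_html(urls: list[str]) -> str:
        run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
            # arun_many fetches concurrently instead of one page at a time.
            results = await crawler.arun_many(urls=urls, config=run_config)
        parts = []
        for i, result in enumerate(results, 1):
            if not result.success:
                raise Exception(f"Failed to fetch URL '{result.url}': {result.error_message}")
            # Clean each page separately so no single page exhausts the size budget.
            cleaned = preprocess_html_for_schema(
                html_content=result.html,
                text_threshold=2000,
                attr_value_threshold=500,
                max_size=500_000,
            )
            # Delimit pages so the LLM treats them as separate examples.
            parts.append(f"'''html example {i}\n{cleaned}\n'''")
        return "\n\n".join(parts)

    combined = asyncio.run(fetch_combined_html([
        "https://example.com/a",
        "https://example.com/b",
    ]))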