Fix: enhance fallback scoring for failed head extraction in LinkPreview. ref #1638

Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638
Merge pull request #1447 from rbushri/fix/wrong_url_raw
2025-11-27 12:14:08 +01:00 · 2025-11-25 11:51:59 +01:00 · 2025-11-25 17:49:44 +08:00 · 2025-11-25 10:46:00 +01:00 · 2025-11-24 13:54:07 +02:00 · 2025-11-17 12:21:23 +01:00
6 changed files with 177 additions and 16 deletions
--- a/5
+++ b/5
@@ -167,6 +167,11 @@ RUN mkdir -p /home/appuser/.cache/ms-playwright \
 RUN crawl4ai-doctor
 # Ensure all cache directories belong to appuser
 # This fixes permission issues with .cache/url_seeder and other runtime cache dirs
 RUN mkdir -p /home/appuser/.cache \
    && chown -R appuser:appuser /home/appuser/.cache
 # Copy application code
 COPY deploy/docker/* ${APP_HOME}/
--- a/crawl4ai/adaptive_crawler.py
+++ b/crawl4ai/adaptive_crawler.py
@@ -728,18 +728,18 @@ class EmbeddingStrategy(CrawlStrategy):
        provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
        api_token = llm_config_dict.get('api_token') if llm_config_dict else None
-        # response = perform_completion_with_backoff(
+        response = perform_completion_with_backoff(
-        #     provider=provider,
+            provider=provider,
-        #     prompt_with_variables=prompt,
+            prompt_with_variables=prompt,
-        #     api_token=api_token,
+            api_token=api_token,
-        #     json_response=True
+            json_response=True
-        # )
+        )
-        # variations = json.loads(response.choices[0].message.content)
+        variations = json.loads(response.choices[0].message.content)
        # # Mock data with more variations for split
-        variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
+        # variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
        # variations = {'queries': [
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -617,11 +617,11 @@ class AsyncWebCrawler:
                else config.chunking_strategy
            )
            sections = chunking.chunk(content)
-            # extracted_content = config.extraction_strategy.run(url, sections)
+            # extracted_content = config.extraction_strategy.run(_url, sections)
            # Use async version if available for better parallelism
            if hasattr(config.extraction_strategy, 'arun'):
-                extracted_content = await config.extraction_strategy.arun(url, sections)
+                extracted_content = await config.extraction_strategy.arun(_url, sections)
            else:
                # Fallback to sync version run in thread pool to avoid blocking
                extracted_content = await asyncio.to_thread(
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
            if el.tag in bypass_tags:
                continue
            # Skip elements inside <pre> or <code> tags where whitespace is significant
            # This preserves whitespace-only spans (e.g., <span class="w"> </span>) in code blocks
            is_in_code_block = False
            ancestor = el.getparent()
            while ancestor is not None:
                if ancestor.tag in ("pre", "code"):
                    is_in_code_block = True
                    break
                ancestor = ancestor.getparent()
            if is_in_code_block:
                continue
            text_content = (el.text_content() or "").strip()
            if (
                len(text_content.split()) < word_count_threshold
--- a/crawl4ai/link_preview.py
+++ b/crawl4ai/link_preview.py
@@ -336,8 +336,40 @@ class LinkPreview:
                updated_internal.append(updated_link)
            else:
-                # Keep original link unchanged
+                # # Keep original link unchanged
-                updated_internal.append(link)
+                # updated_internal.append(link)
                # Head extraction failed - calculate fallback scores
                # Use URL-based scoring if query provided
                contextual_score = None
                if config.link_preview_config and config.link_preview_config.query:
                    # Calculate URL-based relevance score as fallback
                    contextual_score = self.seeder._calculate_url_relevance_score(
                        config.link_preview_config.query,
                        link.href
                    )
                # Create updated link with fallback scoring
                updated_link = Link(
                    href=link.href,
                    text=link.text,
                    title=link.title,
                    base_domain=link.base_domain,
                    head_data=None,  # No head data available
                    head_extraction_status="failed",
                    intrinsic_score=getattr(link, 'intrinsic_score', None),
                    contextual_score=contextual_score
                )
                # Calculate total score even without head data
                updated_link.total_score = calculate_total_score(
                    intrinsic_score=updated_link.intrinsic_score,
                    contextual_score=updated_link.contextual_score,
                    score_links_enabled=getattr(config, 'score_links', False),
                    query_provided=bool(config.link_preview_config and config.link_preview_config.query)
                )
                updated_internal.append(updated_link)
        # Update external links
        updated_external = []
@@ -374,8 +406,40 @@ class LinkPreview:
                updated_external.append(updated_link)
            else:
-                # Keep original link unchanged
+                # # Keep original link unchanged
-                updated_external.append(link)
+                # updated_external.append(link)
                # Head extraction failed - calculate fallback scores
                # Use URL-based scoring if query provided
                contextual_score = None
                if config.link_preview_config and config.link_preview_config.query:
                    # Calculate URL-based relevance score as fallback
                    contextual_score = self.seeder._calculate_url_relevance_score(
                        config.link_preview_config.query,
                        link.href
                    )
                # Create updated link with fallback scoring
                updated_link = Link(
                    href=link.href,
                    text=link.text,
                    title=link.title,
                    base_domain=link.base_domain,
                    head_data=None,  # No head data available
                    head_extraction_status="failed",
                    intrinsic_score=getattr(link, 'intrinsic_score', None),
                    contextual_score=contextual_score
                )
                # Calculate total score even without head data
                updated_link.total_score = calculate_total_score(
                    intrinsic_score=updated_link.intrinsic_score,
                    contextual_score=updated_link.contextual_score,
                    score_links_enabled=getattr(config, 'score_links', False),
                    query_provided=bool(config.link_preview_config and config.link_preview_config.query)
                )
                updated_external.append(updated_link)
        # Sort links by relevance score if available
        if any(hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data 
--- a/tests/general/test_async_webcrawler.py
+++ b/tests/general/test_async_webcrawler.py
@@ -9,6 +9,21 @@ from crawl4ai import (
    RateLimiter,
    CacheMode
 )
 from crawl4ai.extraction_strategy import ExtractionStrategy
 class MockExtractionStrategy(ExtractionStrategy):
     """Extraction strategy stub that remembers every URL handed to ``run``.

     Tests read ``run_calls`` afterwards to check which URL value the
     crawler passed to the strategy.
     """

     def __init__(self):
         super().__init__()
         # URLs received by run(), in call order.
         self.run_calls = []

     def extract(self, url: str, html: str, *args, **kwargs):
         """Return one fixed record, ignoring every argument."""
         canned_blocks = [{"test": "data"}]
         return canned_blocks

     def run(self, url: str, sections: List[str], *args, **kwargs):
         """Record ``url``, then defer to the parent class's ``run``."""
         self.run_calls += [url]
         outcome = super().run(url, sections, *args, **kwargs)
         return outcome
@pytest.mark.asyncio
@pytest.mark.parametrize("viewport", [
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
        assert not result.success
        assert result.error_message is not None
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_regular_url():
    """
    Regression test: a regular URL must reach extraction_strategy.run unchanged.

    Crawls a normal (non-raw, is_raw_html=False) URL with a recording mock
    strategy and checks that its ``run`` was called exactly once with that URL.
    """
    # Regular URL case (is_raw_html=False)
    regular_url = "https://example.com"

    async with AsyncWebCrawler(
        config=BrowserConfig(browser_type="chromium", headless=True)
    ) as crawler:
        mock_strategy = MockExtractionStrategy()
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS,
        )

        result = await crawler.arun(url=regular_url, config=run_config)

        assert result.success
        recorded_urls = mock_strategy.run_calls
        assert len(recorded_urls) == 1
        assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_raw_html():
    """
    Regression test: raw HTML input must reach extraction_strategy.run as "Raw HTML".

    Crawls a ``raw:``-prefixed document (is_raw_html=True) with a recording
    mock strategy and checks that its ``run`` was called exactly once with the
    placeholder "Raw HTML" rather than the full raw input string.
    """
    # Raw HTML case (is_raw_html=True is set automatically for "raw:" input)
    raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"

    async with AsyncWebCrawler(
        config=BrowserConfig(browser_type="chromium", headless=True)
    ) as crawler:
        mock_strategy = MockExtractionStrategy()
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS,
        )

        result = await crawler.arun(url=raw_html_url, config=run_config)

        assert result.success
        recorded_urls = mock_strategy.run_calls
        assert len(recorded_urls) == 1
        assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
 if __name__ == "__main__":
     # Manual entry point: run a selection of the async tests one after
     # another, each through its own asyncio.run() call.
     _run = asyncio.run
     _run(test_viewport_config((1024, 768)))
     _run(test_memory_management())
     _run(test_rate_limiting())
     _run(test_javascript_execution())
     _run(test_extraction_strategy_run_with_regular_url())
     _run(test_extraction_strategy_run_with_raw_html())
Author	SHA1	Message	Date
ntohidi	6695a21a41	Fix: enhance fallback scoring for failed head extraction in LinkPreview. ref #1638	2025-11-27 12:14:08 +01:00
ntohidi	b36c6daa5c	Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638	2025-11-25 11:51:59 +01:00
Nasrin	94c8a833bf	Merge pull request #1447 from rbushri/fix/wrong_url_raw Fix: Wrong URL variable used for extraction of raw html	2025-11-25 17:49:44 +08:00
ntohidi	84bfea8bd1	Fix EmbeddingStrategy: Uncomment response handling for the variations and clean up mock data. ref #1621	2025-11-25 10:46:00 +01:00
Rachel Bushrian	7771ed3894	Merge branch 'develop' into fix/wrong_url_raw	2025-11-24 13:54:07 +02:00
ntohidi	c2c4d42be4	Fix #1181: Preserve whitespace in code blocks during HTML scraping. The remove_empty_elements_fast() method was removing whitespace-only span elements inside <pre> and <code> tags, causing import statements like "import torch" to become "importtorch". Now skips elements inside code blocks, where whitespace is significant.	2025-11-17 12:21:23 +01:00
rbushria	edd0b576b1	Fix: Use correct URL variable for raw HTML extraction (#1116). Prevents full HTML content from being passed as the URL to extraction strategies; adds unit tests to verify raw HTML and regular URL processing. Fix: Wrong URL variable used for extraction of raw HTML.	2025-09-01 23:15:56 +03:00