Merge branch '2025-APR-1' of https://github.com/unclecode/crawl4ai into 2025-APR-1

ntohidi committed 2025-05-08 11:11:32 +02:00
4 changed files with 50 additions and 14 deletions

View File

@@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
         # Check flag if we should remove external images
         if kwargs.get("exclude_external_images", False):
-            element.decompose()
-            return False
-            # src_url_base = src.split('/')[2]
-            # url_base = url.split('/')[2]
-            # if url_base not in src_url_base:
-            #     element.decompose()
-            #     return False
+            # Handle relative URLs (which are always from the same domain)
+            if not src.startswith('http') and not src.startswith('//'):
+                return True  # Keep relative URLs
+
+            # For absolute URLs, compare the base domains using the existing function
+            src_base_domain = get_base_domain(src)
+            url_base_domain = get_base_domain(url)
+
+            # If the domains don't match and both are valid, the image is external
+            if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
+                element.decompose()
+                return False
         # if kwargs.get('exclude_social_media_links', False):
         #     if image_src_base_domain in exclude_social_media_domains:
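A minimal sketch of the new decision logic in isolation, assuming `get_base_domain` reduces a URL to its registrable domain (e.g. `img.cdn.example.com` → `example.com`); the stand-in below is a naive two-label version for illustration, not crawl4ai's actual helper:

```python
from urllib.parse import urlparse

def get_base_domain(u: str) -> str:
    # Naive stand-in for crawl4ai's helper: keep the last two host labels.
    host = urlparse(u).netloc.split(':')[0]
    parts = host.split('.')
    return '.'.join(parts[-2:]) if len(parts) >= 2 else host

def keep_image(src: str, page_url: str) -> bool:
    # Relative URLs always point at the page's own domain: keep them.
    if not src.startswith('http') and not src.startswith('//'):
        return True
    # Drop the image only when both domains resolve and they differ.
    src_base, page_base = get_base_domain(src), get_base_domain(page_url)
    return not (src_base and page_base and src_base != page_base)

print(keep_image("/static/logo.png", "https://example.com/post"))            # True (relative)
print(keep_image("https://cdn.example.com/a.png", "https://example.com/p"))  # True (same base domain)
print(keep_image("https://ads.tracker.net/p.gif", "https://example.com/p"))  # False (external)
```

The old code decomposed every image whenever the flag was set; the new version only removes images whose base domain genuinely differs from the page's.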

View File

@@ -42,6 +42,29 @@ from itertools import chain
 from collections import deque
 from typing import Generator, Iterable
 
+# Monkey patch to fix wildcard handling in urllib.robotparser
+from urllib.robotparser import RuleLine
+import re
+
+original_applies_to = RuleLine.applies_to
+
+def patched_applies_to(self, filename):
+    # Handle wildcards in paths
+    if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
+        pattern = self.path.replace('%2A', '*')
+        pattern = re.escape(pattern).replace('\\*', '.*')
+        pattern = '^' + pattern
+        if pattern.endswith('\\$'):
+            pattern = pattern[:-2] + '$'
+        try:
+            return bool(re.match(pattern, filename))
+        except re.error:
+            return original_applies_to(self, filename)
+    return original_applies_to(self, filename)
+
+RuleLine.applies_to = patched_applies_to
+# Monkey patch ends
+
 def chunk_documents(
     documents: Iterable[str],
     chunk_token_threshold: int,
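Stock `urllib.robotparser` matches rule paths as plain prefixes, so the `*` wildcard (a de facto extension honored by major crawlers) is treated literally; the patch rewrites wildcard rules into regexes. As a quick sanity check, here is a self-contained sketch that installs the same override (trimmed to the wildcard case) and verifies the behavior:

```python
import re
import urllib.robotparser
from urllib.robotparser import RuleLine

original_applies_to = RuleLine.applies_to

def patched_applies_to(self, filename):
    # RuleLine URL-quotes paths, so '*' usually arrives as '%2A'.
    if '*' in self.path or '%2A' in self.path:
        pattern = '^' + re.escape(self.path.replace('%2A', '*')).replace('\\*', '.*')
        try:
            return bool(re.match(pattern, filename))
        except re.error:
            return original_applies_to(self, filename)
    return original_applies_to(self, filename)

RuleLine.applies_to = patched_applies_to

rp = urllib.robotparser.RobotFileParser()
rp.parse("User-agent: *\nDisallow: /private/*/drafts".splitlines())
print(rp.can_fetch("mybot", "https://example.com/private/2024/drafts"))  # False: wildcard matched
print(rp.can_fetch("mybot", "https://example.com/private/readme"))       # True: no rule applies
```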
@@ -303,7 +326,7 @@ class RobotsParser:
         robots_url = f"{scheme}://{domain}/robots.txt"
         async with aiohttp.ClientSession() as session:
-            async with session.get(robots_url, timeout=2) as response:
+            async with session.get(robots_url, timeout=2, ssl=False) as response:
                 if response.status == 200:
                     rules = await response.text()
                     self._cache_rules(domain, rules)
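The second hunk relaxes TLS verification when fetching robots.txt. A minimal standalone version of the patched fetch (the function name and signature here are illustrative, not crawl4ai's API):

```python
import asyncio
import aiohttp

async def fetch_robots(domain: str, scheme: str = "https") -> str | None:
    robots_url = f"{scheme}://{domain}/robots.txt"
    async with aiohttp.ClientSession() as session:
        # ssl=False skips certificate verification, so hosts with expired or
        # self-signed certificates still yield their (public) robots rules.
        async with session.get(robots_url, timeout=2, ssl=False) as response:
            if response.status == 200:
                return await response.text()
    return None

print(asyncio.run(fetch_robots("example.com")))
```

Skipping verification is a deliberate trade-off: robots.txt carries no secrets, and a failed handshake would otherwise abort the fetch entirely.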

View File

@@ -403,7 +403,7 @@ async def main():
     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
-        options={"ignore_links": True}
+        options={"ignore_links": True})
 
     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
 
 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter
 
 async def main():
@@ -4175,8 +4175,13 @@ async def main():
         verbose=True
     )
 
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator
     )
 
     async with AsyncWebCrawler() as crawler:
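Assembled from the two hunks above, the corrected docs snippet routes the filter through the markdown generator rather than passing it to `CrawlerRunConfig`, which takes no `content_filter` argument. Provider, token, and the `fit_markdown` access below are placeholders assuming a recent crawl4ai release:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter

async def main():
    filter = LLMContentFilter(
        llm_config=LLMConfig(provider="openai/gpt-4o", api_token="env:OPENAI_API_KEY"),
        instruction="Extract the main article content as clean markdown.",
        chunk_token_threshold=4096,
        verbose=True,
    )
    # The filter only takes effect when attached to the markdown generator.
    md_generator = DefaultMarkdownGenerator(
        content_filter=filter,
        options={"ignore_links": True},
    )
    config = CrawlerRunConfig(markdown_generator=md_generator)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        print(result.markdown.fit_markdown)

asyncio.run(main())
```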

View File

@@ -233,7 +233,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
 
 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter
 
 async def main():
@@ -255,9 +255,12 @@ async def main():
         chunk_token_threshold=4096,  # Adjust based on your needs
         verbose=True
     )
 
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator,
     )
 
     async with AsyncWebCrawler() as crawler: