Merge branch '2025-APR-1' of https://github.com/unclecode/crawl4ai into 2025-APR-1

ntohidi committed 2025-05-08 11:11:32 +02:00
4 changed files with 50 additions and 14 deletions

View File

@@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
         # Check flag if we should remove external images
         if kwargs.get("exclude_external_images", False):
-            element.decompose()
-            return False
-            # src_url_base = src.split('/')[2]
-            # url_base = url.split('/')[2]
-            # if url_base not in src_url_base:
-            #     element.decompose()
-            #     return False
+            # Handle relative URLs (which are always from the same domain)
+            if not src.startswith('http') and not src.startswith('//'):
+                return True  # Keep relative URLs
+
+            # For absolute URLs, compare the base domains using the existing function
+            src_base_domain = get_base_domain(src)
+            url_base_domain = get_base_domain(url)
+
+            # If the domains don't match and both are valid, the image is external
+            if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
+                element.decompose()
+                return False
         # if kwargs.get('exclude_social_media_links', False):
         #     if image_src_base_domain in exclude_social_media_domains:
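A minimal sketch of the new decision logic in isolation, assuming `get_base_domain` reduces a URL to its registrable domain (e.g. `img.cdn.example.com` → `example.com`); the stand-in below is a naive two-label version for illustration, not crawl4ai's actual helper:

```python
from urllib.parse import urlparse

def get_base_domain(u: str) -> str:
    # Naive stand-in for crawl4ai's helper: keep the last two host labels.
    host = urlparse(u).netloc.split(':')[0]
    parts = host.split('.')
    return '.'.join(parts[-2:]) if len(parts) >= 2 else host

def keep_image(src: str, page_url: str) -> bool:
    # Relative URLs always point at the page's own domain: keep them.
    if not src.startswith('http') and not src.startswith('//'):
        return True
    # Drop the image only when both domains resolve and they differ.
    src_base, page_base = get_base_domain(src), get_base_domain(page_url)
    return not (src_base and page_base and src_base != page_base)

print(keep_image("/static/logo.png", "https://example.com/post"))            # True (relative)
print(keep_image("https://cdn.example.com/a.png", "https://example.com/p"))  # True (same base domain)
print(keep_image("https://ads.tracker.net/p.gif", "https://example.com/p"))  # False (external)
```

The old code decomposed every image whenever the flag was set; the new version only removes images whose base domain genuinely differs from the page's.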

View File

@@ -42,6 +42,29 @@ from itertools import chain
 from collections import deque
 from typing import Generator, Iterable
 
+# Monkey patch to fix wildcard handling in urllib.robotparser
+from urllib.robotparser import RuleLine
+import re
+
+original_applies_to = RuleLine.applies_to
+
+def patched_applies_to(self, filename):
+    # Handle wildcards in paths
+    if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
+        pattern = self.path.replace('%2A', '*')
+        pattern = re.escape(pattern).replace('\\*', '.*')
+        pattern = '^' + pattern
+        if pattern.endswith('\\$'):
+            pattern = pattern[:-2] + '$'
+        try:
+            return bool(re.match(pattern, filename))
+        except re.error:
+            return original_applies_to(self, filename)
+    return original_applies_to(self, filename)
+
+RuleLine.applies_to = patched_applies_to
+# Monkey patch ends
+
 def chunk_documents(
     documents: Iterable[str],
     chunk_token_threshold: int,
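Stock `urllib.robotparser` matches rule paths as plain prefixes, so the `*` wildcard (a de facto extension honored by major crawlers) is treated literally; the patch rewrites wildcard rules into regexes. As a quick sanity check, here is a self-contained sketch that installs the same override (trimmed to the wildcard case) and verifies the behavior:

```python
import re
import urllib.robotparser
from urllib.robotparser import RuleLine

original_applies_to = RuleLine.applies_to

def patched_applies_to(self, filename):
    # RuleLine URL-quotes paths, so '*' usually arrives as '%2A'.
    if '*' in self.path or '%2A' in self.path:
        pattern = '^' + re.escape(self.path.replace('%2A', '*')).replace('\\*', '.*')
        try:
            return bool(re.match(pattern, filename))
        except re.error:
            return original_applies_to(self, filename)
    return original_applies_to(self, filename)

RuleLine.applies_to = patched_applies_to

rp = urllib.robotparser.RobotFileParser()
rp.parse("User-agent: *\nDisallow: /private/*/drafts".splitlines())
print(rp.can_fetch("mybot", "https://example.com/private/2024/drafts"))  # False: wildcard matched
print(rp.can_fetch("mybot", "https://example.com/private/readme"))       # True: no rule applies
```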
@@ -303,7 +326,7 @@ class RobotsParser:
         robots_url = f"{scheme}://{domain}/robots.txt"
         async with aiohttp.ClientSession() as session:
-            async with session.get(robots_url, timeout=2) as response:
+            async with session.get(robots_url, timeout=2, ssl=False) as response:
                 if response.status == 200:
                     rules = await response.text()
                     self._cache_rules(domain, rules)
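The second hunk relaxes TLS verification when fetching robots.txt. A minimal standalone version of the patched fetch (the function name and signature here are illustrative, not crawl4ai's API):

```python
import asyncio
import aiohttp

async def fetch_robots(domain: str, scheme: str = "https") -> str | None:
    robots_url = f"{scheme}://{domain}/robots.txt"
    async with aiohttp.ClientSession() as session:
        # ssl=False skips certificate verification, so hosts with expired or
        # self-signed certificates still yield their (public) robots rules.
        async with session.get(robots_url, timeout=2, ssl=False) as response:
            if response.status == 200:
                return await response.text()
    return None

print(asyncio.run(fetch_robots("example.com")))
```

Skipping verification is a deliberate trade-off: robots.txt carries no secrets, and a failed handshake would otherwise abort the fetch entirely.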

View File

@@ -403,7 +403,7 @@ async def main():
     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
-        options={"ignore_links": True}
+        options={"ignore_links": True})
 
     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
 
 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter
 
 async def main():
@@ -4175,8 +4175,13 @@ async def main():
         verbose=True
     )
 
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator
     )
 
     async with AsyncWebCrawler() as crawler:
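Assembled from the two hunks above, the corrected docs snippet routes the filter through the markdown generator rather than passing it to `CrawlerRunConfig`, which takes no `content_filter` argument. Provider, token, and the `fit_markdown` access below are placeholders assuming a recent crawl4ai release:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter

async def main():
    filter = LLMContentFilter(
        llm_config=LLMConfig(provider="openai/gpt-4o", api_token="env:OPENAI_API_KEY"),
        instruction="Extract the main article content as clean markdown.",
        chunk_token_threshold=4096,
        verbose=True,
    )
    # The filter only takes effect when attached to the markdown generator.
    md_generator = DefaultMarkdownGenerator(
        content_filter=filter,
        options={"ignore_links": True},
    )
    config = CrawlerRunConfig(markdown_generator=md_generator)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        print(result.markdown.fit_markdown)

asyncio.run(main())
```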

View File

@@ -233,7 +233,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
 
 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter
 
 async def main():
@@ -255,9 +255,12 @@ async def main():
         chunk_token_threshold=4096,  # Adjust based on your needs
         verbose=True
     )
 
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator,
     )
 
     async with AsyncWebCrawler() as crawler: