Merge branch '2025-APR-1' of https://github.com/unclecode/crawl4ai into 2025-APR-1

This commit is contained in:
ntohidi
2025-05-08 11:11:32 +02:00
4 changed files with 50 additions and 14 deletions

View File

@@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
# Check flag if we should remove external images
if kwargs.get("exclude_external_images", False):
element.decompose()
return False
# src_url_base = src.split('/')[2]
# url_base = url.split('/')[2]
# if url_base not in src_url_base:
# element.decompose()
# return False
# Handle relative URLs (which are always from the same domain)
if not src.startswith('http') and not src.startswith('//'):
return True # Keep relative URLs
# For absolute URLs, compare the base domains using the existing function
src_base_domain = get_base_domain(src)
url_base_domain = get_base_domain(url)
# If the domains don't match and both are valid, the image is external
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
element.decompose()
return False
# if kwargs.get('exclude_social_media_links', False):
# if image_src_base_domain in exclude_social_media_domains:

View File

@@ -42,6 +42,29 @@ from itertools import chain
from collections import deque
from typing import Generator, Iterable
# Monkey patch to fix wildcard handling in urllib.robotparser
from urllib.robotparser import RuleLine
import re
original_applies_to = RuleLine.applies_to
def patched_applies_to(self, filename):
# Handle wildcards in paths
if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
pattern = self.path.replace('%2A', '*')
pattern = re.escape(pattern).replace('\\*', '.*')
pattern = '^' + pattern
if pattern.endswith('\\$'):
pattern = pattern[:-2] + '$'
try:
return bool(re.match(pattern, filename))
except re.error:
return original_applies_to(self, filename)
return original_applies_to(self, filename)
RuleLine.applies_to = patched_applies_to
# Monkey patch ends
def chunk_documents(
documents: Iterable[str],
chunk_token_threshold: int,
@@ -303,7 +326,7 @@ class RobotsParser:
robots_url = f"{scheme}://{domain}/robots.txt"
async with aiohttp.ClientSession() as session:
async with session.get(robots_url, timeout=2) as response:
async with session.get(robots_url, timeout=2, ssl=False) as response:
if response.status == 200:
rules = await response.text()
self._cache_rules(domain, rules)

View File

@@ -403,7 +403,7 @@ async def main():
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
options={"ignore_links": True})
# 4) Crawler run config: skip cache, use extraction
run_conf = CrawlerRunConfig(
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter
async def main():
@@ -4175,8 +4175,13 @@ async def main():
verbose=True
)
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
)
config = CrawlerRunConfig(
content_filter=filter
markdown_generator=md_generator
)
async with AsyncWebCrawler() as crawler:

View File

@@ -233,7 +233,7 @@ prune_filter = PruningContentFilter(
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter
async def main():
@@ -255,9 +255,12 @@ async def main():
chunk_token_threshold=4096, # Adjust based on your needs
verbose=True
)
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
)
config = CrawlerRunConfig(
content_filter=filter
markdown_generator=md_generator,
)
async with AsyncWebCrawler() as crawler: