Merge branch '2025-APR-1' of https://github.com/unclecode/crawl4ai into 2025-APR-1
This commit is contained in:
@@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
# Check flag if we should remove external images
|
||||
if kwargs.get("exclude_external_images", False):
|
||||
element.decompose()
|
||||
return False
|
||||
# src_url_base = src.split('/')[2]
|
||||
# url_base = url.split('/')[2]
|
||||
# if url_base not in src_url_base:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# Handle relative URLs (which are always from the same domain)
|
||||
if not src.startswith('http') and not src.startswith('//'):
|
||||
return True # Keep relative URLs
|
||||
|
||||
# For absolute URLs, compare the base domains using the existing function
|
||||
src_base_domain = get_base_domain(src)
|
||||
url_base_domain = get_base_domain(url)
|
||||
|
||||
# If the domains don't match and both are valid, the image is external
|
||||
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# if kwargs.get('exclude_social_media_links', False):
|
||||
# if image_src_base_domain in exclude_social_media_domains:
|
||||
|
||||
@@ -42,6 +42,29 @@ from itertools import chain
|
||||
from collections import deque
|
||||
from typing import Generator, Iterable
|
||||
|
||||
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||
from urllib.robotparser import RuleLine
|
||||
import re
|
||||
|
||||
original_applies_to = RuleLine.applies_to
|
||||
|
||||
def patched_applies_to(self, filename):
|
||||
# Handle wildcards in paths
|
||||
if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
|
||||
pattern = self.path.replace('%2A', '*')
|
||||
pattern = re.escape(pattern).replace('\\*', '.*')
|
||||
pattern = '^' + pattern
|
||||
if pattern.endswith('\\$'):
|
||||
pattern = pattern[:-2] + '$'
|
||||
try:
|
||||
return bool(re.match(pattern, filename))
|
||||
except re.error:
|
||||
return original_applies_to(self, filename)
|
||||
return original_applies_to(self, filename)
|
||||
|
||||
RuleLine.applies_to = patched_applies_to
|
||||
# Monkey patch ends
|
||||
|
||||
def chunk_documents(
|
||||
documents: Iterable[str],
|
||||
chunk_token_threshold: int,
|
||||
@@ -303,7 +326,7 @@ class RobotsParser:
|
||||
robots_url = f"{scheme}://{domain}/robots.txt"
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(robots_url, timeout=2) as response:
|
||||
async with session.get(robots_url, timeout=2, ssl=False) as response:
|
||||
if response.status == 200:
|
||||
rules = await response.text()
|
||||
self._cache_rules(domain, rules)
|
||||
|
||||
@@ -403,7 +403,7 @@ async def main():
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
options={"ignore_links": True})
|
||||
|
||||
# 4) Crawler run config: skip cache, use extraction
|
||||
run_conf = CrawlerRunConfig(
|
||||
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
|
||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def main():
|
||||
@@ -4175,8 +4175,13 @@ async def main():
|
||||
verbose=True
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=filter
|
||||
markdown_generator=md_generator
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
|
||||
@@ -233,7 +233,7 @@ prune_filter = PruningContentFilter(
|
||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def main():
|
||||
@@ -255,9 +255,12 @@ async def main():
|
||||
chunk_token_threshold=4096, # Adjust based on your needs
|
||||
verbose=True
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=filter
|
||||
markdown_generator=md_generator,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
|
||||
Reference in New Issue
Block a user