Merge branch '2025-APR-1' of https://github.com/unclecode/crawl4ai into 2025-APR-1
@@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):

         # Check flag if we should remove external images
         if kwargs.get("exclude_external_images", False):
-            element.decompose()
-            return False
-            # src_url_base = src.split('/')[2]
-            # url_base = url.split('/')[2]
-            # if url_base not in src_url_base:
-            #     element.decompose()
-            #     return False
+            # Handle relative URLs (which are always from the same domain)
+            if not src.startswith('http') and not src.startswith('//'):
+                return True # Keep relative URLs
+
+            # For absolute URLs, compare the base domains using the existing function
+            src_base_domain = get_base_domain(src)
+            url_base_domain = get_base_domain(url)
+
+            # If the domains don't match and both are valid, the image is external
+            if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
+                element.decompose()
+                return False

         # if kwargs.get('exclude_social_media_links', False):
         #     if image_src_base_domain in exclude_social_media_domains:
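The hunk above replaces the commented-out netloc string comparison with a base-domain check. A rough standalone sketch of that logic follows; `get_base_domain` here is a simplified stand-in for crawl4ai's helper of the same name (not its actual implementation), and `is_external_image` is a hypothetical wrapper added only for illustration:

```python
# Sketch only: get_base_domain below is a naive stand-in for crawl4ai's helper.
from urllib.parse import urlparse

def get_base_domain(url: str) -> str:
    # Assume "base domain" means the last two labels of the hostname.
    host = urlparse(url).netloc.split(':')[0]
    parts = host.split('.')
    return '.'.join(parts[-2:]) if len(parts) >= 2 else host

def is_external_image(src: str, page_url: str) -> bool:
    # Relative src values always point at the page's own domain.
    if not src.startswith('http') and not src.startswith('//'):
        return False
    src_domain = get_base_domain(src if src.startswith('http') else 'https:' + src)
    page_domain = get_base_domain(page_url)
    # Only treat the image as external when both domains resolve and differ.
    return bool(src_domain and page_domain and src_domain != page_domain)

print(is_external_image('https://cdn.other-host.net/a.png', 'https://example.com/post'))  # True
print(is_external_image('/static/logo.png', 'https://example.com/post'))                  # False
```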
@@ -42,6 +42,29 @@ from itertools import chain
 from collections import deque
 from typing import Generator, Iterable

+
+# Monkey patch to fix wildcard handling in urllib.robotparser
+from urllib.robotparser import RuleLine
+import re
+
+original_applies_to = RuleLine.applies_to
+
+def patched_applies_to(self, filename):
+    # Handle wildcards in paths
+    if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
+        pattern = self.path.replace('%2A', '*')
+        pattern = re.escape(pattern).replace('\\*', '.*')
+        pattern = '^' + pattern
+        if pattern.endswith('\\$'):
+            pattern = pattern[:-2] + '$'
+        try:
+            return bool(re.match(pattern, filename))
+        except re.error:
+            return original_applies_to(self, filename)
+    return original_applies_to(self, filename)
+
+RuleLine.applies_to = patched_applies_to
+# Monkey patch ends
+
 def chunk_documents(
     documents: Iterable[str],
     chunk_token_threshold: int,
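The patch exists because the stock `urllib.robotparser` matches rule paths with `str.startswith`, so a `Disallow` path containing `*` (which the parser quotes to `%2A`) never matches a real URL. A minimal, standalone way to see the difference, without any crawl4ai imports:

```python
# Stock behavior: the wildcard rule below is effectively ignored.
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.parse([
    "User-agent: *",
    "Disallow: /private/*/data",
])

# Prints True with the unpatched parser (the rule never applies);
# prints False once RuleLine.applies_to is patched as in the hunk above.
print(rp.can_fetch("mybot", "https://example.com/private/123/data"))
```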
@@ -303,7 +326,7 @@ class RobotsParser:
         robots_url = f"{scheme}://{domain}/robots.txt"

         async with aiohttp.ClientSession() as session:
-            async with session.get(robots_url, timeout=2) as response:
+            async with session.get(robots_url, timeout=2, ssl=False) as response:
                 if response.status == 200:
                     rules = await response.text()
                     self._cache_rules(domain, rules)
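`ssl=False` tells aiohttp to skip certificate verification for the robots.txt request, so hosts with self-signed or misconfigured certificates no longer make the fetch fail. If verification ever needs to stay configurable, roughly the same effect can be had with an explicit SSL context; this is a sketch, not part of the change:

```python
# Sketch: same robots.txt fetch, but with an explicit permissive SSL context
# instead of ssl=False (equivalent in effect).
import asyncio
import ssl
import aiohttp

async def fetch_robots(domain: str, scheme: str = "https"):
    robots_url = f"{scheme}://{domain}/robots.txt"
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE  # disable verification, like ssl=False
    async with aiohttp.ClientSession() as session:
        async with session.get(robots_url, timeout=aiohttp.ClientTimeout(total=2), ssl=ctx) as response:
            if response.status == 200:
                return await response.text()
    return None

# asyncio.run(fetch_robots("example.com"))
```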
@@ -403,7 +403,7 @@ async def main():

    md_generator = DefaultMarkdownGenerator(
        content_filter=filter,
-        options={"ignore_links": True}
+        options={"ignore_links": True})

    # 4) Crawler run config: skip cache, use extraction
    run_conf = CrawlerRunConfig(
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -4175,8 +4175,13 @@ async def main():
         verbose=True
     )

+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator
     )

     async with AsyncWebCrawler() as crawler:
@@ -233,7 +233,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -255,9 +255,12 @@ async def main():
         chunk_token_threshold=4096, # Adjust based on your needs
         verbose=True
     )
-
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator,
     )

     async with AsyncWebCrawler() as crawler:
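Taken together, the four documentation hunks wire the `LLMContentFilter` into `DefaultMarkdownGenerator` rather than passing it to `CrawlerRunConfig` directly. Assembled, the corrected example reads roughly like this; it is a sketch in which the `LLMContentFilter` arguments other than `chunk_token_threshold` and `verbose`, the example URL, and the provider/token values are placeholders:

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter

async def main():
    # Placeholder construction; see the surrounding docs for the full argument list.
    filter = LLMContentFilter(
        llm_config=LLMConfig(provider="openai/gpt-4o", api_token="env:OPENAI_API_KEY"),
        instruction="Extract the main article content as clean markdown.",
        chunk_token_threshold=4096,  # Adjust based on your needs
        verbose=True,
    )

    # The filter now hangs off the markdown generator ...
    md_generator = DefaultMarkdownGenerator(
        content_filter=filter,
        options={"ignore_links": True},
    )

    # ... and the run config points at the generator, not at a content_filter kwarg.
    config = CrawlerRunConfig(markdown_generator=md_generator)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        print(result.markdown)

if __name__ == "__main__":
    asyncio.run(main())
```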