Merge branch '2025-MAY-2' into next-MAY

Author: ntohidi
Date: 2025-07-08 11:46:13 +02:00
28 changed files with 448 additions and 154 deletions

View File

@@ -291,11 +291,19 @@ import requests
 # Submit a crawl job
 response = requests.post(
     "http://localhost:11235/crawl",
-    json={"urls": "https://example.com", "priority": 10}
+    json={"urls": ["https://example.com"], "priority": 10}
 )
-task_id = response.json()["task_id"]
-
-# Continue polling until the task is complete (status="completed")
+if response.status_code == 200:
+    print("Crawl job submitted successfully.")
+    if "results" in response.json():
+        results = response.json()["results"]
+        print("Crawl job completed. Results:")
+        for result in results:
+            print(result)
+    else:
+        task_id = response.json()["task_id"]
+        print(f"Crawl job submitted. Task ID: {task_id}")
         result = requests.get(f"http://localhost:11235/task/{task_id}")
 ```
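
The old snippet's removed comment already hinted at polling until `status="completed"`. A minimal polling loop against the task endpoint might look like this (a sketch; the exact response field names are assumptions):

```python
import time
import requests

response = requests.post(
    "http://localhost:11235/crawl",
    json={"urls": ["https://example.com"], "priority": 10},
)
task_id = response.json()["task_id"]

# Poll until the server reports completion ("status" field assumed
# from the removed comment above).
while True:
    task = requests.get(f"http://localhost:11235/task/{task_id}").json()
    if task.get("status") == "completed":
        print(task.get("result"))
        break
    time.sleep(1)
```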

View File

@@ -445,6 +445,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             return await self._crawl_web(url, config)

         elif url.startswith("file://"):
+            # Initialize empty list for console messages
+            captured_console = []
             # Process local file
             local_file_path = url[7:]  # Remove 'file://' prefix

             if not os.path.exists(local_file_path):
@@ -741,18 +744,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 )
                 redirected_url = page.url
             except Error as e:
-                raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
+                # Allow navigation to be aborted when downloading files.
+                # This is expected behavior for downloads in some browser engines.
+                if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
+                    self.logger.info(
+                        message=f"Navigation aborted, likely due to file download: {url}",
+                        tag="GOTO",
+                        params={"url": url},
+                    )
+                    response = None
+                else:
+                    raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")

             await self.execute_hook(
                 "after_goto", page, context=context, url=url, response=response, config=config
             )

+            # ──────────────────────────────────────────────────────────────
+            # Walk the redirect chain. Playwright returns only the last
+            # hop, so we trace the `request.redirected_from` links until the
+            # first response that differs from the final one and surface its
+            # status code.
+            # ──────────────────────────────────────────────────────────────
             if response is None:
                 status_code = 200
                 response_headers = {}
             else:
-                status_code = response.status
-                response_headers = response.headers
+                first_resp = response
+                req = response.request
+                while req and req.redirected_from:
+                    prev_req = req.redirected_from
+                    prev_resp = await prev_req.response()
+                    if prev_resp:  # keep earliest
+                        first_resp = prev_resp
+                    req = prev_req
+                status_code = first_resp.status
+                response_headers = first_resp.headers
+                # if response is None:
+                #     status_code = 200
+                #     response_headers = {}
+                # else:
+                #     status_code = response.status
+                #     response_headers = response.headers

         else:
             status_code = 200
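
The redirect-chain walk above can be exercised in isolation with plain Playwright; a sketch (the URL is illustrative, `redirected_from` and `response()` are public Playwright APIs):

```python
import asyncio
from playwright.async_api import async_playwright

async def first_hop_status(url: str) -> int:
    """Status code of the first response in a redirect chain."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        response = await page.goto(url)
        first_resp, req = response, response.request
        # goto() returns only the last hop; walk back through
        # redirected_from to reach the earliest response.
        while req and req.redirected_from:
            prev_req = req.redirected_from
            prev_resp = await prev_req.response()
            if prev_resp:
                first_resp = prev_resp
            req = prev_req
        status = first_resp.status
        await browser.close()
        return status

print(asyncio.run(first_hop_status("http://github.com")))  # e.g. 301
```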
@@ -1616,12 +1650,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 num_segments = (page_height // viewport_height) + 1
                 for i in range(num_segments):
                     y_offset = i * viewport_height
+                    # Special handling for the last segment
+                    if i == num_segments - 1:
+                        last_part_height = page_height % viewport_height
+                        # If page_height is an exact multiple of viewport_height,
+                        # we don't need an extra segment
+                        if last_part_height == 0:
+                            break
+                        # Adjust viewport to exactly match the remaining content height
+                        await page.set_viewport_size({"width": page_width, "height": last_part_height})
                     await page.evaluate(f"window.scrollTo(0, {y_offset})")
                     await asyncio.sleep(0.01)  # wait for render
-                    seg_shot = await page.screenshot(full_page=False)
+                    # Capture the current segment as JPEG to reduce memory usage
+                    seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85)
+                    # seg_shot = await page.screenshot(full_page=False)
                     img = Image.open(BytesIO(seg_shot)).convert("RGB")
                     segments.append(img)

+                # Reset viewport to original size after capturing segments
+                await page.set_viewport_size({"width": page_width, "height": viewport_height})

                 total_height = sum(img.height for img in segments)
                 stitched = Image.new("RGB", (segments[0].width, total_height))
                 offset = 0

View File

@@ -39,6 +39,7 @@ class LogColor(str, Enum):
     YELLOW = "yellow"
     MAGENTA = "magenta"
     DIM_MAGENTA = "dim magenta"
+    RED = "red"

     def __str__(self):
         """Automatically convert rich color to string."""

View File

@@ -588,10 +588,12 @@ class AsyncWebCrawler:
                 # Choose content based on input_format
                 content_format = config.extraction_strategy.input_format
                 if content_format == "fit_markdown" and not markdown_result.fit_markdown:
-                    self.logger.warning(
-                        message="Fit markdown requested but not available. Falling back to raw markdown.",
+                    self.logger.url_status(
+                        url=_url,
+                        success=bool(html),
+                        timing=time.perf_counter() - t1,
                         tag="EXTRACT",
-                        params={"url": _url},
                     )
                     content_format = "markdown"
@@ -616,10 +618,11 @@ class AsyncWebCrawler:
                 )

                 # Log extraction completion
-                self.logger.info(
-                    message="Completed for {url:.50}... | Time: {timing}s",
+                self.logger.url_status(
+                    url=_url,
+                    success=bool(html),
+                    timing=time.perf_counter() - t1,
                     tag="EXTRACT",
-                    params={"url": _url, "timing": time.perf_counter() - t1},
                 )

             # Apply HTML formatting if requested

View File

@@ -480,7 +480,7 @@ class BrowserProfiler:
self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA) self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA)
exit_option = "4" exit_option = "4"
self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") self.logger.info(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
choice = input() choice = input()
if choice == "1": if choice == "1":
@@ -637,9 +637,18 @@ class BrowserProfiler:
self.logger.info(f"Debugging port: {debugging_port}", tag="CDP") self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
self.logger.info(f"Headless mode: {headless}", tag="CDP") self.logger.info(f"Headless mode: {headless}", tag="CDP")
# create browser config
browser_config = BrowserConfig(
browser_type=browser_type,
headless=headless,
user_data_dir=profile_path,
debugging_port=debugging_port,
verbose=True
)
# Create managed browser instance # Create managed browser instance
managed_browser = ManagedBrowser( managed_browser = ManagedBrowser(
browser_type=browser_type, browser_config=browser_config,
user_data_dir=profile_path, user_data_dir=profile_path,
headless=headless, headless=headless,
logger=self.logger, logger=self.logger,

View File

@@ -1010,7 +1010,7 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") @click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all") @click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)") @click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling") @click.option("--bypass-cache", "-bc", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content") @click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True) @click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)") @click.option("--profile", "-p", help="Use a specific browser profile (by name)")

View File

@@ -720,13 +720,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                 # Check flag if we should remove external images
                 if kwargs.get("exclude_external_images", False):
+                    # Handle relative URLs (which are always from the same domain)
+                    if not src.startswith('http') and not src.startswith('//'):
+                        return True  # Keep relative URLs
+                    # For absolute URLs, compare the base domains using the existing function
+                    src_base_domain = get_base_domain(src)
+                    url_base_domain = get_base_domain(url)
+                    # If the domains don't match and both are valid, the image is external
+                    if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
                         element.decompose()
                         return False
+                    # src_url_base = src.split('/')[2]
+                    # url_base = url.split('/')[2]
+                    # if url_base not in src_url_base:
+                    #     element.decompose()
+                    #     return False

                 # if kwargs.get('exclude_social_media_links', False):
                 #     if image_src_base_domain in exclude_social_media_domains:
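
`get_base_domain` here is the library's own helper; a rough standalone equivalent of the comparison, sketched with `urllib.parse` (naive about public suffixes, unlike the real function):

```python
from urllib.parse import urlparse

def naive_base_domain(url: str) -> str:
    # Last two labels of the hostname; a real implementation
    # (like crawl4ai's get_base_domain) accounts for public suffixes.
    host = urlparse(url).netloc.split(":")[0]
    return ".".join(host.split(".")[-2:]) if host else ""

def is_external_image(src: str, page_url: str) -> bool:
    if not src.startswith("http") and not src.startswith("//"):
        return False  # relative URLs always stay on the same domain
    src_domain = naive_base_domain(src)
    page_domain = naive_base_domain(page_url)
    return bool(src_domain and page_domain and src_domain != page_domain)

print(is_external_image("https://cdn.ads.org/a.png", "https://example.com/post"))  # True
print(is_external_image("/static/logo.png", "https://example.com/post"))           # False
```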

View File

@@ -150,6 +150,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break break
# Calculate how many more URLs we can process in this batch
remaining = self.max_pages - self._pages_crawled
batch_size = min(BATCH_SIZE, remaining)
if batch_size <= 0:
# No more pages to crawl
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break
batch: List[Tuple[float, int, str, Optional[str]]] = [] batch: List[Tuple[float, int, str, Optional[str]]] = []
# Retrieve up to BATCH_SIZE items from the priority queue. # Retrieve up to BATCH_SIZE items from the priority queue.
for _ in range(BATCH_SIZE): for _ in range(BATCH_SIZE):
@@ -184,6 +192,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                     # Count only successful crawls toward max_pages limit
                     if result.success:
                         self._pages_crawled += 1
+                        # Check if we've reached the limit during batch processing
+                        if self._pages_crawled >= self.max_pages:
+                            self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                            break  # Exit the generator
                     yield result
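
The same `max_pages` guard recurs in the BFS and DFS strategies below. Reduced to its essence, the pattern is a counting generator (a sketch, not the library's code; note that, as in the diff, the result that trips the limit is counted but not yielded):

```python
def limit_pages(results, max_pages: int):
    """Yield crawl results, stopping once max_pages successes are seen."""
    pages_crawled = 0
    for result in results:
        if result.success:
            pages_crawled += 1
            if pages_crawled >= max_pages:
                break  # exit the generator; this result is not yielded
        yield result
```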

View File

@@ -157,6 +157,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         results: List[CrawlResult] = []

         while current_level and not self._cancel_event.is_set():
+            # Check if we've already reached max_pages before starting a new level
+            if self._pages_crawled >= self.max_pages:
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break

             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                 # Count only successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator
                 results_count += 1
                 yield result

View File

@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                 # Count only successful crawls toward max_pages limit
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                     # Only discover links from successful crawls
                     new_links: List[Tuple[str, Optional[str]]] = []
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                 # and only discover links from successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                     new_links: List[Tuple[str, Optional[str]]] = []
                     await self.link_discovery(result, url, depth, visited, new_links, depths)

View File

@@ -73,6 +73,8 @@ class Crawl4aiDockerClient:
     def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
                          crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
         """Prepare request data from configs."""
+        if self._token:
+            self._http_client.headers["Authorization"] = f"Bearer {self._token}"
         return {
             "urls": urls,
             "browser_config": browser_config.dump() if browser_config else {},
@@ -103,8 +105,6 @@ class Crawl4aiDockerClient:
         crawler_config: Optional[CrawlerRunConfig] = None
     ) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
         """Execute a crawl operation."""
-        if not self._token:
-            raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
         await self._check_server()

         data = self._prepare_request(urls, browser_config, crawler_config)
@@ -140,8 +140,6 @@ class Crawl4aiDockerClient:
     async def get_schema(self) -> Dict[str, Any]:
         """Retrieve configuration schemas."""
-        if not self._token:
-            raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
         response = await self._request("GET", "/schema")
         return response.json()
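
With the hard authentication checks removed, the token is now attached opportunistically. The idea in isolation (a sketch using `httpx`; class and method names are illustrative):

```python
import httpx
from typing import Optional

class MiniClient:
    def __init__(self, base_url: str, token: Optional[str] = None):
        self._token = token
        self._http_client = httpx.AsyncClient(base_url=base_url)

    def _apply_auth(self) -> None:
        # Set the Authorization header only when a token exists, so
        # unauthenticated servers keep working without authenticate().
        if self._token:
            self._http_client.headers["Authorization"] = f"Bearer {self._token}"
```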

View File

@@ -656,11 +656,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
                 self.total_usage.total_tokens += usage.total_tokens

                 try:
-                    response = response.choices[0].message.content
+                    content = response.choices[0].message.content
                     blocks = None
                     if self.force_json_response:
-                        blocks = json.loads(response)
+                        blocks = json.loads(content)
                         if isinstance(blocks, dict):
                             # If it has only one key whose value is a list, assign that to blocks, e.g. {"news": [...]}
                             if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
@@ -673,7 +673,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
                             blocks = blocks
                     else:
                         # blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"]
-                        blocks = extract_xml_data(["blocks"], response)["blocks"]
+                        blocks = extract_xml_data(["blocks"], content)["blocks"]
                         blocks = json.loads(blocks)

                     for block in blocks:
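
The rename from `response` to `content` stops the code from clobbering the LLM response object it still needs. The single-key unwrapping it feeds into, sketched standalone:

```python
import json

def unwrap_blocks(content: str):
    """If the model returns {"news": [...]}, surface the inner list."""
    blocks = json.loads(content)
    if isinstance(blocks, dict) and len(blocks) == 1:
        (value,) = blocks.values()
        if isinstance(value, list):
            return value
    return blocks

print(unwrap_blocks('{"news": [{"title": "a"}, {"title": "b"}]}'))
# [{'title': 'a'}, {'title': 'b'}]
```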

View File

@@ -50,6 +50,29 @@ from urllib.parse import (
 )

+# Monkey patch to fix wildcard handling in urllib.robotparser
+from urllib.robotparser import RuleLine
+import re
+
+original_applies_to = RuleLine.applies_to
+
+def patched_applies_to(self, filename):
+    # Handle wildcards in paths
+    if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
+        pattern = self.path.replace('%2A', '*')
+        pattern = re.escape(pattern).replace('\\*', '.*')
+        pattern = '^' + pattern
+        if pattern.endswith('\\$'):
+            pattern = pattern[:-2] + '$'
+        try:
+            return bool(re.match(pattern, filename))
+        except re.error:
+            return original_applies_to(self, filename)
+    return original_applies_to(self, filename)
+
+RuleLine.applies_to = patched_applies_to
+# Monkey patch ends

 def chunk_documents(
     documents: Iterable[str],
     chunk_token_threshold: int,
@@ -318,7 +341,7 @@ class RobotsParser:
             robots_url = f"{scheme}://{domain}/robots.txt"

             async with aiohttp.ClientSession() as session:
-                async with session.get(robots_url, timeout=2) as response:
+                async with session.get(robots_url, timeout=2, ssl=False) as response:
                     if response.status == 200:
                         rules = await response.text()
                         self._cache_rules(domain, rules)
@@ -1524,6 +1547,13 @@ def extract_metadata_using_lxml(html, doc=None):
             content = tag.get("content", "").strip()
             if property_name and content:
                 metadata[property_name] = content

+        # Article metadata - using starts-with() for performance
+        article_tags = head.xpath('.//meta[starts-with(@property, "article:")]')
+        for tag in article_tags:
+            property_name = tag.get("property", "").strip()
+            content = tag.get("content", "").strip()
+            if property_name and content:
+                metadata[property_name] = content

     return metadata
@@ -1599,7 +1629,12 @@ def extract_metadata(html, soup=None):
             content = tag.get("content", "").strip()
             if property_name and content:
                 metadata[property_name] = content

+    # Collect article:* metadata values
+    metadata.update({
+        tag['property'].strip(): tag["content"].strip()
+        for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")})
+        if tag.has_attr('property') and tag.has_attr('content')
+    })

     return metadata
@@ -2069,13 +2104,15 @@ def normalize_url(href, base_url):
     if not parsed_base.scheme or not parsed_base.netloc:
         raise ValueError(f"Invalid base URL format: {base_url}")

-    # Ensure base_url ends with a trailing slash if it's a directory path
-    if not base_url.endswith('/'):
-        base_url = base_url + '/'
+    if parsed_base.scheme.lower() not in ["http", "https"]:
+        # Reject special (non-http/https) protocols
+        raise ValueError(f"Invalid base URL format: {base_url}")
+
+    cleaned_href = href.strip()

     # Use urljoin to handle all cases
-    normalized = urljoin(base_url, href.strip())
-    return normalized
+    return urljoin(base_url, cleaned_href)

 def normalize_url(
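
The robots.txt monkey patch above converts wildcard rule paths into anchored regexes. The transform can be checked on its own (same logic as the patch, minus the patching):

```python
import re

def wildcard_applies_to(rule_path: str, filename: str) -> bool:
    # Un-escape %2A, escape regex metacharacters, then turn '*'
    # into '.*' and anchor the pattern at the start of the path.
    pattern = rule_path.replace("%2A", "*")
    pattern = re.escape(pattern).replace("\\*", ".*")
    pattern = "^" + pattern
    if pattern.endswith("\\$"):
        pattern = pattern[:-2] + "$"
    return bool(re.match(pattern, filename))

print(wildcard_applies_to("/private*/", "/private-data/page.html"))  # True
print(wildcard_applies_to("/*.pdf$", "/docs/manual.pdf"))            # True
print(wildcard_applies_to("/*.pdf$", "/docs/manual.pdf?x=1"))        # False
```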

View File

@@ -459,7 +459,7 @@ async def handle_crawl_request(
         #     await crawler.close()
         # except Exception as close_e:
         #     logger.error(f"Error closing crawler during exception handling: {close_e}")
-        logger.error(f"Error closing crawler during exception handling: {close_e}")
+        logger.error(f"Error closing crawler during exception handling: {str(e)}")

         # Measure memory even on error if possible
         end_mem_mb_error = _get_memory_mb()
@@ -518,7 +518,7 @@ async def handle_stream_crawl_request(
         #     await crawler.close()
         # except Exception as close_e:
         #     logger.error(f"Error closing crawler during stream setup exception: {close_e}")
-        logger.error(f"Error closing crawler during stream setup exception: {close_e}")
+        logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
         logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
         # Raising HTTPException here will prevent streaming response
         raise HTTPException(

View File

@@ -403,7 +403,7 @@ async def main():
     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
-        options={"ignore_links": True}
+        options={"ignore_links": True})

     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(
@@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_web():
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/apple",
@@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_local_file():
     local_file_path = "/path/to/apple.html"  # Replace with your file path
     file_url = f"file://{local_file_path}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=file_url, config=config)
@@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`.
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_raw_html():
     raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
     raw_html_url = f"raw:{raw_html}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=raw_html_url, config=config)
@@ -3845,7 +3845,7 @@ import os
 import sys
 import asyncio
 from pathlib import Path
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def main():
@@ -3856,7 +3856,7 @@ async def main():
     async with AsyncWebCrawler() as crawler:
         # Step 1: Crawl the Web URL
         print("\n=== Step 1: Crawling the Wikipedia URL ===")
-        web_config = CrawlerRunConfig(bypass_cache=True)
+        web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         result = await crawler.arun(url=wikipedia_url, config=web_config)

         if not result.success:
@@ -3871,7 +3871,7 @@ async def main():
         # Step 2: Crawl from the Local HTML File
         print("=== Step 2: Crawling from the Local HTML File ===")
         file_url = f"file://{html_file_path.resolve()}"
-        file_config = CrawlerRunConfig(bypass_cache=True)
+        file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         local_result = await crawler.arun(url=file_url, config=file_config)

         if not local_result.success:
@@ -3887,7 +3887,7 @@ async def main():
         with open(html_file_path, 'r', encoding='utf-8') as f:
             raw_html_content = f.read()
         raw_html_url = f"raw:{raw_html_content}"
-        raw_config = CrawlerRunConfig(bypass_cache=True)
+        raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         raw_result = await crawler.arun(url=raw_html_url, config=raw_config)

         if not raw_result.success:
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -4175,8 +4175,13 @@ async def main():
         verbose=True
     )

+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator
     )

     async with AsyncWebCrawler() as crawler:
@@ -5428,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
 ```python
 import os, asyncio
 from base64 import b64decode
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

 async def main():
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        screenshot=True,
+        pdf=True
+    )
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
-            cache_mode=CacheMode.BYPASS,
-            pdf=True,
-            screenshot=True
+            config=run_config
         )

         if result.success:
-            # Save screenshot
+            print(f"Screenshot data present: {result.screenshot is not None}")
+            print(f"PDF data present: {result.pdf is not None}")

             if result.screenshot:
+                print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
                 with open("wikipedia_screenshot.png", "wb") as f:
                     f.write(b64decode(result.screenshot))
+            else:
+                print("[WARN] Screenshot data is None.")

-            # Save PDF
             if result.pdf:
+                print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
                 with open("wikipedia_page.pdf", "wb") as f:
                     f.write(result.pdf)
+            else:
+                print("[WARN] PDF data is None.")

-            print("[OK] PDF & screenshot captured.")
         else:
             print("[ERROR]", result.error_message)

View File

@@ -12,8 +12,7 @@ class CrawlRequest(BaseModel):
 class MarkdownRequest(BaseModel):
     """Request body for the /md endpoint."""
     url: str = Field(..., description="Absolute http/https URL to fetch")
-    f: FilterType = Field(FilterType.FIT,
-                          description="Content-filter strategy: FIT, RAW, BM25, or LLM")
+    f: FilterType = Field(FilterType.FIT, description="Content-filter strategy: fit, raw, bm25, or llm")
     q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
     c: Optional[str] = Field("0", description="Cache-bust / revision counter")
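
For reference, a call against this model might look like the following (a sketch; the host and port are carried over from the crawl example earlier, and the response shape is an assumption):

```python
import requests

resp = requests.post(
    "http://localhost:11235/md",
    json={
        "url": "https://example.com",
        "f": "bm25",           # content-filter strategy: fit, raw, bm25, or llm
        "q": "pricing plans",  # query used by the BM25/LLM filters
        "c": "0",              # cache-bust / revision counter
    },
)
print(resp.json())
```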

View File

@@ -66,29 +66,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
 ```python
 import os, asyncio
 from base64 import b64decode
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

 async def main():
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        screenshot=True,
+        pdf=True
+    )
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
-            cache_mode=CacheMode.BYPASS,
-            pdf=True,
-            screenshot=True
+            config=run_config
         )

         if result.success:
-            # Save screenshot
+            print(f"Screenshot data present: {result.screenshot is not None}")
+            print(f"PDF data present: {result.pdf is not None}")

             if result.screenshot:
+                print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
                 with open("wikipedia_screenshot.png", "wb") as f:
                     f.write(b64decode(result.screenshot))
+            else:
+                print("[WARN] Screenshot data is None.")

-            # Save PDF
             if result.pdf:
+                print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
                 with open("wikipedia_page.pdf", "wb") as f:
                     f.write(result.pdf)
+            else:
+                print("[WARN] PDF data is None.")

-            print("[OK] PDF & screenshot captured.")
         else:
             print("[ERROR]", result.error_message)

View File

@@ -25,44 +25,70 @@ Use an authenticated proxy with `BrowserConfig`:
 ```python
 from crawl4ai.async_configs import BrowserConfig

-proxy_config = {
-    "server": "http://proxy.example.com:8080",
-    "username": "user",
-    "password": "pass"
-}
-browser_config = BrowserConfig(proxy_config=proxy_config)
+browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
 async with AsyncWebCrawler(config=browser_config) as crawler:
     result = await crawler.arun(url="https://example.com")
 ```

 ## Rotating Proxies

 Example using a proxy rotation service dynamically:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-
-async def get_next_proxy():
-    # Your proxy rotation logic here
-    return {"server": "http://next.proxy.com:8080"}
+import re
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    RoundRobinProxyStrategy,
+)
+import asyncio
+from crawl4ai import ProxyConfig

 async def main():
-    browser_config = BrowserConfig()
-    run_config = CrawlerRunConfig()
+    # Load proxies and create rotation strategy
+    proxies = ProxyConfig.from_env()
+    # e.g. export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
+    if not proxies:
+        print("No proxies found in environment. Set PROXIES env variable!")
+        return
+
+    proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+    # Create configs
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        proxy_rotation_strategy=proxy_strategy
+    )

     async with AsyncWebCrawler(config=browser_config) as crawler:
-        # For each URL, create a new run config with different proxy
-        for url in urls:
-            proxy = await get_next_proxy()
-            # Clone the config and update proxy - this creates a new browser context
-            current_config = run_config.clone(proxy_config=proxy)
-            result = await crawler.arun(url=url, config=current_config)
+        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice
+
+        print("\n📈 Initializing crawler with proxy rotation...")
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            print("\n🚀 Starting batch crawl with proxy rotation...")
+            results = await crawler.arun_many(
+                urls=urls,
+                config=run_config
+            )
+            for result in results:
+                if result.success:
+                    ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                    current_proxy = run_config.proxy_config if run_config.proxy_config else None
+                    if current_proxy and ip_match:
+                        print(f"URL {result.url}")
+                        print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
+                        verified = ip_match.group(0) == current_proxy.ip
+                        if verified:
+                            print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
+                        else:
+                            print("❌ Proxy failed or IP mismatch!")
+                    print("---")

+if __name__ == "__main__":
+    import asyncio
     asyncio.run(main())
 ```
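
The `PROXIES` format consumed by `ProxyConfig.from_env()` is the one shown in the example's comment; wiring it up for a quick local test might look like this (a sketch; addresses are placeholders):

```python
import os

# Comma-separated entries in ip:port:username:password form
os.environ["PROXIES"] = ",".join([
    "203.0.113.10:8080:user1:pass1",
    "203.0.113.11:8080:user2:pass2",
])

from crawl4ai import ProxyConfig

proxies = ProxyConfig.from_env()
print(f"Loaded {len(proxies)} proxies")
```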

View File

@@ -273,7 +273,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
 from crawl4ai import JsonCssExtractionStrategy

 async def main():
@@ -298,7 +298,7 @@ async def main():
     # 3) Example LLM content filtering
     gemini_config = LLMConfig(
-        provider="gemini/gemini-1.5-pro"
+        provider="gemini/gemini-1.5-pro",
         api_token = "env:GEMINI_API_TOKEN"
     )
@@ -324,6 +324,7 @@ async def main():
     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
         options={"ignore_links": True}
+    )

     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(

View File

@@ -17,6 +17,9 @@
 - [Configuration Reference](#configuration-reference)
 - [Best Practices & Tips](#best-practices--tips)

+## Installation
+The Crawl4AI CLI is installed automatically when you install the library.

 ## Basic Usage

 The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:

View File

@@ -8,11 +8,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_web():
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/apple",
@@ -33,13 +33,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_local_file():
     local_file_path = "/path/to/apple.html"  # Replace with your file path
     file_url = f"file://{local_file_path}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=file_url, config=config)
@@ -93,7 +93,7 @@ import os
 import sys
 import asyncio
 from pathlib import Path
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def main():
@@ -104,7 +104,7 @@ async def main():
     async with AsyncWebCrawler() as crawler:
         # Step 1: Crawl the Web URL
         print("\n=== Step 1: Crawling the Wikipedia URL ===")
-        web_config = CrawlerRunConfig(bypass_cache=True)
+        web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         result = await crawler.arun(url=wikipedia_url, config=web_config)

         if not result.success:
@@ -119,7 +119,7 @@ async def main():
         # Step 2: Crawl from the Local HTML File
         print("=== Step 2: Crawling from the Local HTML File ===")
         file_url = f"file://{html_file_path.resolve()}"
-        file_config = CrawlerRunConfig(bypass_cache=True)
+        file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         local_result = await crawler.arun(url=file_url, config=file_config)

         if not local_result.success:
@@ -135,7 +135,7 @@ async def main():
         with open(html_file_path, 'r', encoding='utf-8') as f:
             raw_html_content = f.read()
         raw_html_url = f"raw:{raw_html_content}"
-        raw_config = CrawlerRunConfig(bypass_cache=True)
+        raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         raw_result = await crawler.arun(url=raw_html_url, config=raw_config)

         if not raw_result.success:

View File

@@ -201,6 +201,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
 - **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.
 - **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.
 - **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
+- **`language`** *(default `'english'`)*: Language used for stemming.

 **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
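
Putting those parameters together (a sketch; the parameter names follow the list above, and the import path mirrors the LLMContentFilter example below):

```python
from crawl4ai.content_filter_strategy import BM25ContentFilter

bm25_filter = BM25ContentFilter(
    user_query="machine learning frameworks",  # the focus term
    bm25_threshold=1.2,   # raise to keep fewer blocks
    use_stemming=True,    # "learn" also matches "learning", "learnt"
    language="english",   # stemmer language (the new parameter above)
)
```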
@@ -233,7 +234,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -255,9 +256,12 @@ async def main():
         chunk_token_threshold=4096,  # Adjust based on your needs
         verbose=True
     )

+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator,
     )

     async with AsyncWebCrawler() as crawler:

View File

@@ -17,7 +17,7 @@ dependencies = [
"lxml~=5.3", "lxml~=5.3",
"litellm>=1.53.1", "litellm>=1.53.1",
"numpy>=1.26.0,<3", "numpy>=1.26.0,<3",
"pillow~=10.4", "pillow>=10.4",
"playwright>=1.49.0", "playwright>=1.49.0",
"python-dotenv~=1.0", "python-dotenv~=1.0",
"requests~=2.26", "requests~=2.26",
@@ -32,7 +32,6 @@ dependencies = [
"psutil>=6.1.1", "psutil>=6.1.1",
"nltk>=3.9.1", "nltk>=3.9.1",
"playwright", "playwright",
"aiofiles",
"rich>=13.9.4", "rich>=13.9.4",
"cssselect>=1.2.0", "cssselect>=1.2.0",
"httpx>=0.27.2", "httpx>=0.27.2",

View File

@@ -4,7 +4,7 @@ aiosqlite~=0.20
 lxml~=5.3
 litellm>=1.53.1
 numpy>=1.26.0,<3
-pillow~=10.4
+pillow>=10.4
 playwright>=1.49.0
 python-dotenv~=1.0
 requests~=2.26
@@ -27,3 +27,7 @@ httpx[http2]>=0.27.2
 sentence-transformers>=2.2.0
 alphashape>=1.3.1
 shapely>=2.0.0
+fake-useragent>=2.2.0
+pdf2image>=1.17.0
+PyPDF2>=3.0.1

View File

@@ -105,7 +105,7 @@ def test_docker_deployment(version="basic"):
 def test_basic_crawl(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 10,
         "session_id": "test",
     }
@@ -119,7 +119,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
 def test_basic_crawl_sync(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl (Sync) ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 10,
         "session_id": "test",
     }
@@ -134,7 +134,7 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "js_code": [
             "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
@@ -151,7 +151,7 @@ def test_js_execution(tester: Crawl4AiTester):
 def test_css_selector(tester: Crawl4AiTester):
     print("\n=== Testing CSS Selector ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 7,
         "css_selector": ".wide-tease-item__description",
         "crawler_params": {"headless": True},
@@ -188,7 +188,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://www.coinbase.com/explore",
+        "urls": ["https://www.coinbase.com/explore"],
         "priority": 9,
         "extraction_config": {"type": "json_css", "params": {"schema": schema}},
     }
@@ -223,7 +223,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://openai.com/api/pricing",
+        "urls": ["https://openai.com/api/pricing"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
@@ -270,7 +270,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
def test_cosine_extraction(tester: Crawl4AiTester): def test_cosine_extraction(tester: Crawl4AiTester):
print("\n=== Testing Cosine Extraction ===") print("\n=== Testing Cosine Extraction ===")
request = { request = {
"urls": "https://www.nbcnews.com/business", "urls": ["https://www.nbcnews.com/business"],
"priority": 8, "priority": 8,
"extraction_config": { "extraction_config": {
"type": "cosine", "type": "cosine",
@@ -323,7 +323,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
 def test_screenshot(tester: Crawl4AiTester):
     print("\n=== Testing Screenshot ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 5,
         "screenshot": True,
         "crawler_params": {"headless": True},

View File

@@ -74,7 +74,7 @@ def test_docker_deployment(version="basic"):
 def test_basic_crawl(tester: Crawl4AiTester):
     print("\n=== Testing Basic Crawl ===")
-    request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
+    request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
     result = tester.submit_and_wait(request)
     print(f"Basic crawl result length: {len(result['result']['markdown'])}")
@@ -85,7 +85,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
 def test_js_execution(tester: Crawl4AiTester):
     print("\n=== Testing JS Execution ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "js_code": [
             "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
@@ -102,7 +102,7 @@ def test_js_execution(tester: Crawl4AiTester):
 def test_css_selector(tester: Crawl4AiTester):
     print("\n=== Testing CSS Selector ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 7,
         "css_selector": ".wide-tease-item__description",
         "crawler_params": {"headless": True},
@@ -139,7 +139,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://www.coinbase.com/explore",
+        "urls": ["https://www.coinbase.com/explore"],
         "priority": 9,
         "extraction_config": {"type": "json_css", "params": {"schema": schema}},
     }
@@ -174,7 +174,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://openai.com/api/pricing",
+        "urls": ["https://openai.com/api/pricing"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
@@ -221,7 +221,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
     }

     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
@@ -248,7 +248,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
 def test_cosine_extraction(tester: Crawl4AiTester):
     print("\n=== Testing Cosine Extraction ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "extraction_config": {
             "type": "cosine",
@@ -274,7 +274,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
 def test_screenshot(tester: Crawl4AiTester):
     print("\n=== Testing Screenshot ===")
     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 5,
         "screenshot": True,
         "crawler_params": {"headless": True},

View File

@@ -54,7 +54,7 @@ class NBCNewsAPITest:
 async def test_basic_crawl():
     print("\n=== Testing Basic Crawl ===")
     async with NBCNewsAPITest() as api:
-        request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
+        request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
         task_id = await api.submit_crawl(request)
         result = await api.wait_for_task(task_id)
         print(f"Basic crawl result length: {len(result['result']['markdown'])}")
@@ -67,7 +67,7 @@ async def test_js_execution():
print("\n=== Testing JS Execution ===") print("\n=== Testing JS Execution ===")
async with NBCNewsAPITest() as api: async with NBCNewsAPITest() as api:
request = { request = {
"urls": "https://www.nbcnews.com/business", "urls": ["https://www.nbcnews.com/business"],
"priority": 8, "priority": 8,
"js_code": [ "js_code": [
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
@@ -86,7 +86,7 @@ async def test_css_selector():
print("\n=== Testing CSS Selector ===") print("\n=== Testing CSS Selector ===")
async with NBCNewsAPITest() as api: async with NBCNewsAPITest() as api:
request = { request = {
"urls": "https://www.nbcnews.com/business", "urls": ["https://www.nbcnews.com/business"],
"priority": 7, "priority": 7,
"css_selector": ".wide-tease-item__description", "css_selector": ".wide-tease-item__description",
} }
@@ -120,7 +120,7 @@ async def test_structured_extraction():
     }

     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 9,
         "extraction_config": {"type": "json_css", "params": {"schema": schema}},
     }
@@ -177,7 +177,7 @@ async def test_llm_extraction():
     }

     request = {
-        "urls": "https://www.nbcnews.com/business",
+        "urls": ["https://www.nbcnews.com/business"],
         "priority": 8,
         "extraction_config": {
             "type": "llm",
@@ -209,7 +209,7 @@ async def test_screenshot():
print("\n=== Testing Screenshot ===") print("\n=== Testing Screenshot ===")
async with NBCNewsAPITest() as api: async with NBCNewsAPITest() as api:
request = { request = {
"urls": "https://www.nbcnews.com/business", "urls": ["https://www.nbcnews.com/business"],
"priority": 5, "priority": 5,
"screenshot": True, "screenshot": True,
"crawler_params": {"headless": True}, "crawler_params": {"headless": True},
@@ -227,7 +227,7 @@ async def test_priority_handling():
     async with NBCNewsAPITest() as api:
         # Submit low priority task first
         low_priority = {
-            "urls": "https://www.nbcnews.com/business",
+            "urls": ["https://www.nbcnews.com/business"],
             "priority": 1,
             "crawler_params": {"headless": True},
         }
@@ -235,7 +235,7 @@ async def test_priority_handling():
         # Submit high priority task
         high_priority = {
-            "urls": "https://www.nbcnews.com/business/consumer",
+            "urls": ["https://www.nbcnews.com/business/consumer"],
             "priority": 10,
             "crawler_params": {"headless": True},
         }

View File

@@ -0,0 +1,91 @@
+import unittest
+from crawl4ai.utils import normalize_url
+
+
+class TestNormalizeUrl(unittest.TestCase):
+    def test_basic_relative_path(self):
+        self.assertEqual(normalize_url("path/to/page.html", "http://example.com/base/"), "http://example.com/base/path/to/page.html")
+
+    def test_base_url_with_trailing_slash(self):
+        self.assertEqual(normalize_url("page.html", "http://example.com/base/"), "http://example.com/base/page.html")
+
+    def test_base_url_without_trailing_slash(self):
+        # If normalize_url correctly uses urljoin, "base" is treated as a file.
+        self.assertEqual(normalize_url("page.html", "http://example.com/base"), "http://example.com/page.html")
+
+    def test_absolute_url_as_href(self):
+        self.assertEqual(normalize_url("http://another.com/page.html", "http://example.com/"), "http://another.com/page.html")
+
+    def test_href_with_leading_trailing_spaces(self):
+        self.assertEqual(normalize_url(" page.html ", "http://example.com/"), "http://example.com/page.html")
+
+    def test_empty_href(self):
+        # urljoin with an empty href and base ending in '/' returns the base.
+        self.assertEqual(normalize_url("", "http://example.com/base/"), "http://example.com/base/")
+        # urljoin with an empty href and base not ending in '/' also returns the base.
+        self.assertEqual(normalize_url("", "http://example.com/base"), "http://example.com/base")
+
+    def test_href_with_query_parameters(self):
+        self.assertEqual(normalize_url("page.html?query=test", "http://example.com/"), "http://example.com/page.html?query=test")
+
+    def test_href_with_fragment(self):
+        self.assertEqual(normalize_url("page.html#section", "http://example.com/"), "http://example.com/page.html#section")
+
+    def test_different_scheme_in_href(self):
+        self.assertEqual(normalize_url("https://secure.example.com/page.html", "http://example.com/"), "https://secure.example.com/page.html")
+
+    def test_parent_directory_in_href(self):
+        self.assertEqual(normalize_url("../otherpage.html", "http://example.com/base/current/"), "http://example.com/base/otherpage.html")
+
+    def test_root_relative_href(self):
+        self.assertEqual(normalize_url("/otherpage.html", "http://example.com/base/current/"), "http://example.com/otherpage.html")
+
+    def test_base_url_with_path_and_no_trailing_slash(self):
+        # If normalize_url correctly uses urljoin, "path" is treated as a file.
+        self.assertEqual(normalize_url("file.html", "http://example.com/path"), "http://example.com/file.html")
+
+    def test_base_url_is_just_domain(self):
+        self.assertEqual(normalize_url("page.html", "http://example.com"), "http://example.com/page.html")
+
+    def test_href_is_only_query(self):
+        self.assertEqual(normalize_url("?query=true", "http://example.com/page.html"), "http://example.com/page.html?query=true")
+
+    def test_href_is_only_fragment(self):
+        self.assertEqual(normalize_url("#fragment", "http://example.com/page.html"), "http://example.com/page.html#fragment")
+
+    def test_relative_link_from_base_file_url(self):
+        """
+        Tests the specific bug report: relative links from a base URL that is a file.
+        Example:
+            Page URL: http://example.com/path/to/document.html
+            Link on page: <a href="./file.xlsx">
+            Expected: http://example.com/path/to/file.xlsx
+        """
+        base_url_file = "http://example.com/zwgk/fdzdgk/zdxx/spaq/t19360680.shtml"
+        href_relative_current_dir = "./P020241203375994691134.xlsx"
+        expected_url1 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/P020241203375994691134.xlsx"
+        self.assertEqual(normalize_url(href_relative_current_dir, base_url_file), expected_url1)
+
+        # Test with a relative link that doesn't start with "./"
+        href_relative_no_dot_slash = "another.doc"
+        expected_url2 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/another.doc"
+        self.assertEqual(normalize_url(href_relative_no_dot_slash, base_url_file), expected_url2)
+
+    def test_invalid_base_url_scheme(self):
+        with self.assertRaises(ValueError) as context:
+            normalize_url("page.html", "ftp://example.com/")
+        self.assertIn("Invalid base URL format", str(context.exception))
+
+    def test_invalid_base_url_netloc(self):
+        with self.assertRaises(ValueError) as context:
+            normalize_url("page.html", "http:///path/")
+        self.assertIn("Invalid base URL format", str(context.exception))
+
+    def test_base_url_with_port(self):
+        self.assertEqual(normalize_url("path/file.html", "http://example.com:8080/base/"), "http://example.com:8080/base/path/file.html")
+
+    def test_href_with_special_characters(self):
+        self.assertEqual(normalize_url("path%20with%20spaces/file.html", "http://example.com/"), "http://example.com/path%20with%20spaces/file.html")
+
+
+if __name__ == '__main__':
+    unittest.main()