Merge branch '2025-MAY-2' into next-MAY
This commit is contained in:
16
README.md
16
README.md
@@ -291,12 +291,20 @@ import requests
|
|||||||
# Submit a crawl job
|
# Submit a crawl job
|
||||||
response = requests.post(
|
response = requests.post(
|
||||||
"http://localhost:11235/crawl",
|
"http://localhost:11235/crawl",
|
||||||
json={"urls": "https://example.com", "priority": 10}
|
json={"urls": ["https://example.com"], "priority": 10}
|
||||||
)
|
)
|
||||||
task_id = response.json()["task_id"]
|
if response.status_code == 200:
|
||||||
|
print("Crawl job submitted successfully.")
|
||||||
|
|
||||||
# Continue polling until the task is complete (status="completed")
|
if "results" in response.json():
|
||||||
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
results = response.json()["results"]
|
||||||
|
print("Crawl job completed. Results:")
|
||||||
|
for result in results:
|
||||||
|
print(result)
|
||||||
|
else:
|
||||||
|
task_id = response.json()["task_id"]
|
||||||
|
print(f"Crawl job submitted. Task ID:: {task_id}")
|
||||||
|
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
||||||
```
|
```
|
||||||
|
|
||||||
For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/).
|
For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/).
|
||||||
|
|||||||
@@ -445,6 +445,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
return await self._crawl_web(url, config)
|
return await self._crawl_web(url, config)
|
||||||
|
|
||||||
elif url.startswith("file://"):
|
elif url.startswith("file://"):
|
||||||
|
# initialize empty lists for console messages
|
||||||
|
captured_console = []
|
||||||
|
|
||||||
# Process local file
|
# Process local file
|
||||||
local_file_path = url[7:] # Remove 'file://' prefix
|
local_file_path = url[7:] # Remove 'file://' prefix
|
||||||
if not os.path.exists(local_file_path):
|
if not os.path.exists(local_file_path):
|
||||||
@@ -741,18 +744,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
)
|
)
|
||||||
redirected_url = page.url
|
redirected_url = page.url
|
||||||
except Error as e:
|
except Error as e:
|
||||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
# Allow navigation to be aborted when downloading files
|
||||||
|
# This is expected behavior for downloads in some browser engines
|
||||||
|
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
|
||||||
|
self.logger.info(
|
||||||
|
message=f"Navigation aborted, likely due to file download: {url}",
|
||||||
|
tag="GOTO",
|
||||||
|
params={"url": url},
|
||||||
|
)
|
||||||
|
response = None
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||||
|
|
||||||
await self.execute_hook(
|
await self.execute_hook(
|
||||||
"after_goto", page, context=context, url=url, response=response, config=config
|
"after_goto", page, context=context, url=url, response=response, config=config
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────
|
||||||
|
# Walk the redirect chain. Playwright returns only the last
|
||||||
|
# hop, so we trace the `request.redirected_from` links until the
|
||||||
|
# first response that differs from the final one and surface its
|
||||||
|
# status-code.
|
||||||
|
# ──────────────────────────────────────────────────────────────
|
||||||
if response is None:
|
if response is None:
|
||||||
status_code = 200
|
status_code = 200
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
else:
|
else:
|
||||||
status_code = response.status
|
first_resp = response
|
||||||
response_headers = response.headers
|
req = response.request
|
||||||
|
while req and req.redirected_from:
|
||||||
|
prev_req = req.redirected_from
|
||||||
|
prev_resp = await prev_req.response()
|
||||||
|
if prev_resp: # keep earliest
|
||||||
|
first_resp = prev_resp
|
||||||
|
req = prev_req
|
||||||
|
|
||||||
|
status_code = first_resp.status
|
||||||
|
response_headers = first_resp.headers
|
||||||
|
# if response is None:
|
||||||
|
# status_code = 200
|
||||||
|
# response_headers = {}
|
||||||
|
# else:
|
||||||
|
# status_code = response.status
|
||||||
|
# response_headers = response.headers
|
||||||
|
|
||||||
else:
|
else:
|
||||||
status_code = 200
|
status_code = 200
|
||||||
@@ -1616,12 +1650,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
num_segments = (page_height // viewport_height) + 1
|
num_segments = (page_height // viewport_height) + 1
|
||||||
for i in range(num_segments):
|
for i in range(num_segments):
|
||||||
y_offset = i * viewport_height
|
y_offset = i * viewport_height
|
||||||
|
# Special handling for the last segment
|
||||||
|
if i == num_segments - 1:
|
||||||
|
last_part_height = page_height % viewport_height
|
||||||
|
|
||||||
|
# If page_height is an exact multiple of viewport_height,
|
||||||
|
# we don't need an extra segment
|
||||||
|
if last_part_height == 0:
|
||||||
|
# Skip last segment if page height is exact multiple of viewport
|
||||||
|
break
|
||||||
|
|
||||||
|
# Adjust viewport to exactly match the remaining content height
|
||||||
|
await page.set_viewport_size({"width": page_width, "height": last_part_height})
|
||||||
|
|
||||||
await page.evaluate(f"window.scrollTo(0, {y_offset})")
|
await page.evaluate(f"window.scrollTo(0, {y_offset})")
|
||||||
await asyncio.sleep(0.01) # wait for render
|
await asyncio.sleep(0.01) # wait for render
|
||||||
seg_shot = await page.screenshot(full_page=False)
|
|
||||||
|
# Capture the current segment
|
||||||
|
# Note: Using compression options (format, quality) would go here
|
||||||
|
seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85)
|
||||||
|
# seg_shot = await page.screenshot(full_page=False)
|
||||||
img = Image.open(BytesIO(seg_shot)).convert("RGB")
|
img = Image.open(BytesIO(seg_shot)).convert("RGB")
|
||||||
segments.append(img)
|
segments.append(img)
|
||||||
|
|
||||||
|
# Reset viewport to original size after capturing segments
|
||||||
|
await page.set_viewport_size({"width": page_width, "height": viewport_height})
|
||||||
|
|
||||||
total_height = sum(img.height for img in segments)
|
total_height = sum(img.height for img in segments)
|
||||||
stitched = Image.new("RGB", (segments[0].width, total_height))
|
stitched = Image.new("RGB", (segments[0].width, total_height))
|
||||||
offset = 0
|
offset = 0
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ class LogColor(str, Enum):
|
|||||||
YELLOW = "yellow"
|
YELLOW = "yellow"
|
||||||
MAGENTA = "magenta"
|
MAGENTA = "magenta"
|
||||||
DIM_MAGENTA = "dim magenta"
|
DIM_MAGENTA = "dim magenta"
|
||||||
|
RED = "red"
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
"""Automatically convert rich color to string."""
|
"""Automatically convert rich color to string."""
|
||||||
|
|||||||
@@ -588,11 +588,13 @@ class AsyncWebCrawler:
|
|||||||
# Choose content based on input_format
|
# Choose content based on input_format
|
||||||
content_format = config.extraction_strategy.input_format
|
content_format = config.extraction_strategy.input_format
|
||||||
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
||||||
self.logger.warning(
|
|
||||||
message="Fit markdown requested but not available. Falling back to raw markdown.",
|
self.logger.url_status(
|
||||||
tag="EXTRACT",
|
url=_url,
|
||||||
params={"url": _url},
|
success=bool(html),
|
||||||
)
|
timing=time.perf_counter() - t1,
|
||||||
|
tag="EXTRACT",
|
||||||
|
)
|
||||||
content_format = "markdown"
|
content_format = "markdown"
|
||||||
|
|
||||||
content = {
|
content = {
|
||||||
@@ -616,11 +618,12 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Log extraction completion
|
# Log extraction completion
|
||||||
self.logger.info(
|
self.logger.url_status(
|
||||||
message="Completed for {url:.50}... | Time: {timing}s",
|
url=_url,
|
||||||
tag="EXTRACT",
|
success=bool(html),
|
||||||
params={"url": _url, "timing": time.perf_counter() - t1},
|
timing=time.perf_counter() - t1,
|
||||||
)
|
tag="EXTRACT",
|
||||||
|
)
|
||||||
|
|
||||||
# Apply HTML formatting if requested
|
# Apply HTML formatting if requested
|
||||||
if config.prettiify:
|
if config.prettiify:
|
||||||
|
|||||||
@@ -480,7 +480,7 @@ class BrowserProfiler:
|
|||||||
self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA)
|
self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA)
|
||||||
exit_option = "4"
|
exit_option = "4"
|
||||||
|
|
||||||
self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
|
self.logger.info(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
|
||||||
choice = input()
|
choice = input()
|
||||||
|
|
||||||
if choice == "1":
|
if choice == "1":
|
||||||
@@ -637,9 +637,18 @@ class BrowserProfiler:
|
|||||||
self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
|
self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
|
||||||
self.logger.info(f"Headless mode: {headless}", tag="CDP")
|
self.logger.info(f"Headless mode: {headless}", tag="CDP")
|
||||||
|
|
||||||
|
# create browser config
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
browser_type=browser_type,
|
||||||
|
headless=headless,
|
||||||
|
user_data_dir=profile_path,
|
||||||
|
debugging_port=debugging_port,
|
||||||
|
verbose=True
|
||||||
|
)
|
||||||
|
|
||||||
# Create managed browser instance
|
# Create managed browser instance
|
||||||
managed_browser = ManagedBrowser(
|
managed_browser = ManagedBrowser(
|
||||||
browser_type=browser_type,
|
browser_config=browser_config,
|
||||||
user_data_dir=profile_path,
|
user_data_dir=profile_path,
|
||||||
headless=headless,
|
headless=headless,
|
||||||
logger=self.logger,
|
logger=self.logger,
|
||||||
|
|||||||
@@ -1010,7 +1010,7 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
|
|||||||
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
|
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
|
||||||
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
|
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
|
||||||
@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
|
@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
|
||||||
@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling")
|
@click.option("--bypass-cache", "-bc", is_flag=True, default=True, help="Bypass cache when crawling")
|
||||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||||
@click.option("--verbose", "-v", is_flag=True)
|
@click.option("--verbose", "-v", is_flag=True)
|
||||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||||
|
|||||||
@@ -720,13 +720,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
# Check flag if we should remove external images
|
# Check flag if we should remove external images
|
||||||
if kwargs.get("exclude_external_images", False):
|
if kwargs.get("exclude_external_images", False):
|
||||||
element.decompose()
|
# Handle relative URLs (which are always from the same domain)
|
||||||
return False
|
if not src.startswith('http') and not src.startswith('//'):
|
||||||
# src_url_base = src.split('/')[2]
|
return True # Keep relative URLs
|
||||||
# url_base = url.split('/')[2]
|
|
||||||
# if url_base not in src_url_base:
|
# For absolute URLs, compare the base domains using the existing function
|
||||||
# element.decompose()
|
src_base_domain = get_base_domain(src)
|
||||||
# return False
|
url_base_domain = get_base_domain(url)
|
||||||
|
|
||||||
|
# If the domains don't match and both are valid, the image is external
|
||||||
|
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
|
||||||
|
element.decompose()
|
||||||
|
return False
|
||||||
|
|
||||||
# if kwargs.get('exclude_social_media_links', False):
|
# if kwargs.get('exclude_social_media_links', False):
|
||||||
# if image_src_base_domain in exclude_social_media_domains:
|
# if image_src_base_domain in exclude_social_media_domains:
|
||||||
|
|||||||
@@ -150,6 +150,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Calculate how many more URLs we can process in this batch
|
||||||
|
remaining = self.max_pages - self._pages_crawled
|
||||||
|
batch_size = min(BATCH_SIZE, remaining)
|
||||||
|
if batch_size <= 0:
|
||||||
|
# No more pages to crawl
|
||||||
|
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||||
|
break
|
||||||
|
|
||||||
batch: List[Tuple[float, int, str, Optional[str]]] = []
|
batch: List[Tuple[float, int, str, Optional[str]]] = []
|
||||||
# Retrieve up to BATCH_SIZE items from the priority queue.
|
# Retrieve up to BATCH_SIZE items from the priority queue.
|
||||||
for _ in range(BATCH_SIZE):
|
for _ in range(BATCH_SIZE):
|
||||||
@@ -184,6 +192,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
# Count only successful crawls toward max_pages limit
|
# Count only successful crawls toward max_pages limit
|
||||||
if result.success:
|
if result.success:
|
||||||
self._pages_crawled += 1
|
self._pages_crawled += 1
|
||||||
|
# Check if we've reached the limit during batch processing
|
||||||
|
if self._pages_crawled >= self.max_pages:
|
||||||
|
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||||
|
break # Exit the generator
|
||||||
|
|
||||||
yield result
|
yield result
|
||||||
|
|
||||||
|
|||||||
@@ -157,6 +157,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
results: List[CrawlResult] = []
|
results: List[CrawlResult] = []
|
||||||
|
|
||||||
while current_level and not self._cancel_event.is_set():
|
while current_level and not self._cancel_event.is_set():
|
||||||
|
# Check if we've already reached max_pages before starting a new level
|
||||||
|
if self._pages_crawled >= self.max_pages:
|
||||||
|
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||||
|
break
|
||||||
|
|
||||||
next_level: List[Tuple[str, Optional[str]]] = []
|
next_level: List[Tuple[str, Optional[str]]] = []
|
||||||
urls = [url for url, _ in current_level]
|
urls = [url for url, _ in current_level]
|
||||||
|
|
||||||
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
# Count only successful crawls
|
# Count only successful crawls
|
||||||
if result.success:
|
if result.success:
|
||||||
self._pages_crawled += 1
|
self._pages_crawled += 1
|
||||||
|
# Check if we've reached the limit during batch processing
|
||||||
|
if self._pages_crawled >= self.max_pages:
|
||||||
|
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||||
|
break # Exit the generator
|
||||||
|
|
||||||
results_count += 1
|
results_count += 1
|
||||||
yield result
|
yield result
|
||||||
|
|||||||
@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
|||||||
# Count only successful crawls toward max_pages limit
|
# Count only successful crawls toward max_pages limit
|
||||||
if result.success:
|
if result.success:
|
||||||
self._pages_crawled += 1
|
self._pages_crawled += 1
|
||||||
|
# Check if we've reached the limit during batch processing
|
||||||
|
if self._pages_crawled >= self.max_pages:
|
||||||
|
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||||
|
break # Exit the generator
|
||||||
|
|
||||||
# Only discover links from successful crawls
|
# Only discover links from successful crawls
|
||||||
new_links: List[Tuple[str, Optional[str]]] = []
|
new_links: List[Tuple[str, Optional[str]]] = []
|
||||||
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
|||||||
# and only discover links from successful crawls
|
# and only discover links from successful crawls
|
||||||
if result.success:
|
if result.success:
|
||||||
self._pages_crawled += 1
|
self._pages_crawled += 1
|
||||||
|
# Check if we've reached the limit during batch processing
|
||||||
|
if self._pages_crawled >= self.max_pages:
|
||||||
|
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||||
|
break # Exit the generator
|
||||||
|
|
||||||
new_links: List[Tuple[str, Optional[str]]] = []
|
new_links: List[Tuple[str, Optional[str]]] = []
|
||||||
await self.link_discovery(result, url, depth, visited, new_links, depths)
|
await self.link_discovery(result, url, depth, visited, new_links, depths)
|
||||||
|
|||||||
@@ -73,6 +73,8 @@ class Crawl4aiDockerClient:
|
|||||||
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
|
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
|
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
|
||||||
"""Prepare request data from configs."""
|
"""Prepare request data from configs."""
|
||||||
|
if self._token:
|
||||||
|
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
||||||
return {
|
return {
|
||||||
"urls": urls,
|
"urls": urls,
|
||||||
"browser_config": browser_config.dump() if browser_config else {},
|
"browser_config": browser_config.dump() if browser_config else {},
|
||||||
@@ -103,8 +105,6 @@ class Crawl4aiDockerClient:
|
|||||||
crawler_config: Optional[CrawlerRunConfig] = None
|
crawler_config: Optional[CrawlerRunConfig] = None
|
||||||
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||||
"""Execute a crawl operation."""
|
"""Execute a crawl operation."""
|
||||||
if not self._token:
|
|
||||||
raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
|
|
||||||
await self._check_server()
|
await self._check_server()
|
||||||
|
|
||||||
data = self._prepare_request(urls, browser_config, crawler_config)
|
data = self._prepare_request(urls, browser_config, crawler_config)
|
||||||
@@ -140,8 +140,6 @@ class Crawl4aiDockerClient:
|
|||||||
|
|
||||||
async def get_schema(self) -> Dict[str, Any]:
|
async def get_schema(self) -> Dict[str, Any]:
|
||||||
"""Retrieve configuration schemas."""
|
"""Retrieve configuration schemas."""
|
||||||
if not self._token:
|
|
||||||
raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
|
|
||||||
response = await self._request("GET", "/schema")
|
response = await self._request("GET", "/schema")
|
||||||
return response.json()
|
return response.json()
|
||||||
|
|
||||||
|
|||||||
@@ -656,11 +656,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
self.total_usage.total_tokens += usage.total_tokens
|
self.total_usage.total_tokens += usage.total_tokens
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = response.choices[0].message.content
|
content = response.choices[0].message.content
|
||||||
blocks = None
|
blocks = None
|
||||||
|
|
||||||
if self.force_json_response:
|
if self.force_json_response:
|
||||||
blocks = json.loads(response)
|
blocks = json.loads(content)
|
||||||
if isinstance(blocks, dict):
|
if isinstance(blocks, dict):
|
||||||
# If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]}
|
# If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]}
|
||||||
if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
|
if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
|
||||||
@@ -673,7 +673,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
blocks = blocks
|
blocks = blocks
|
||||||
else:
|
else:
|
||||||
# blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"]
|
# blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"]
|
||||||
blocks = extract_xml_data(["blocks"], response)["blocks"]
|
blocks = extract_xml_data(["blocks"], content)["blocks"]
|
||||||
blocks = json.loads(blocks)
|
blocks = json.loads(blocks)
|
||||||
|
|
||||||
for block in blocks:
|
for block in blocks:
|
||||||
|
|||||||
@@ -50,6 +50,29 @@ from urllib.parse import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||||
|
from urllib.robotparser import RuleLine
|
||||||
|
import re
|
||||||
|
|
||||||
|
original_applies_to = RuleLine.applies_to
|
||||||
|
|
||||||
|
def patched_applies_to(self, filename):
    """Return True when this robots.txt rule's path pattern matches *filename*.

    Replacement for ``RuleLine.applies_to`` that understands the two
    pattern metacharacters used in robots.txt paths:

    * ``*``  - matches any run of characters (also accepted as ``%2A``);
    * ``$``  - when it is the last character, anchors the match at the end
      of the URL path (also accepted as ``%24``).

    Rules without a wildcard are delegated unchanged to the original
    ``applies_to`` implementation.

    Args:
        filename: The URL path being tested against the rule.

    Returns:
        bool: whether the rule applies to ``filename``.
    """
    path = self.path

    # No wildcard: keep the stock prefix-matching behaviour.
    if '*' not in path and '%2A' not in path:
        return original_applies_to(self, filename)

    # NOTE(review): CPython's RuleLine is understood to percent-encode the
    # rule path, so the metacharacters normally arrive as %2A / %24 rather
    # than as literal '*' / '$'. Both spellings are accepted here.
    pattern = path.replace('%2A', '*')

    # A trailing '$' (raw or percent-encoded) is an end-of-path anchor,
    # not a literal character. Strip it before escaping and re-add it as a
    # real regex anchor afterwards.
    anchored = False
    for marker in ('%24', '$'):
        if pattern.endswith(marker):
            pattern = pattern[:-len(marker)]
            anchored = True
            break

    # Escape everything, then turn the escaped '*' back into "match anything".
    regex = '^' + re.escape(pattern).replace('\\*', '.*')
    if anchored:
        regex += '$'

    try:
        return bool(re.match(regex, filename))
    except re.error:
        # Defensive: fall back to the stock behaviour if the pattern
        # somehow fails to compile.
        return original_applies_to(self, filename)
|
||||||
|
|
||||||
|
RuleLine.applies_to = patched_applies_to
|
||||||
|
# Monkey patch ends
|
||||||
|
|
||||||
def chunk_documents(
|
def chunk_documents(
|
||||||
documents: Iterable[str],
|
documents: Iterable[str],
|
||||||
chunk_token_threshold: int,
|
chunk_token_threshold: int,
|
||||||
@@ -318,7 +341,7 @@ class RobotsParser:
|
|||||||
robots_url = f"{scheme}://{domain}/robots.txt"
|
robots_url = f"{scheme}://{domain}/robots.txt"
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
async with session.get(robots_url, timeout=2) as response:
|
async with session.get(robots_url, timeout=2, ssl=False) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
rules = await response.text()
|
rules = await response.text()
|
||||||
self._cache_rules(domain, rules)
|
self._cache_rules(domain, rules)
|
||||||
@@ -1524,6 +1547,13 @@ def extract_metadata_using_lxml(html, doc=None):
|
|||||||
content = tag.get("content", "").strip()
|
content = tag.get("content", "").strip()
|
||||||
if property_name and content:
|
if property_name and content:
|
||||||
metadata[property_name] = content
|
metadata[property_name] = content
|
||||||
|
# Article metadata - using starts-with() for performance
|
||||||
|
article_tags = head.xpath('.//meta[starts-with(@property, "article:")]')
|
||||||
|
for tag in article_tags:
|
||||||
|
property_name = tag.get("property", "").strip()
|
||||||
|
content = tag.get("content", "").strip()
|
||||||
|
if property_name and content:
|
||||||
|
metadata[property_name] = content
|
||||||
|
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
@@ -1599,7 +1629,12 @@ def extract_metadata(html, soup=None):
|
|||||||
content = tag.get("content", "").strip()
|
content = tag.get("content", "").strip()
|
||||||
if property_name and content:
|
if property_name and content:
|
||||||
metadata[property_name] = content
|
metadata[property_name] = content
|
||||||
|
# getting the article Values
|
||||||
|
metadata.update({
|
||||||
|
tag['property'].strip():tag["content"].strip()
|
||||||
|
for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")})
|
||||||
|
if tag.has_attr('property') and tag.has_attr('content')
|
||||||
|
})
|
||||||
return metadata
|
return metadata
|
||||||
|
|
||||||
|
|
||||||
@@ -2069,13 +2104,15 @@ def normalize_url(href, base_url):
|
|||||||
if not parsed_base.scheme or not parsed_base.netloc:
|
if not parsed_base.scheme or not parsed_base.netloc:
|
||||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||||
|
|
||||||
# Ensure base_url ends with a trailing slash if it's a directory path
|
if parsed_base.scheme.lower() not in ["http", "https"]:
|
||||||
if not base_url.endswith('/'):
|
# Handle special protocols
|
||||||
base_url = base_url + '/'
|
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||||
|
cleaned_href = href.strip()
|
||||||
|
|
||||||
# Use urljoin to handle all cases
|
# Use urljoin to handle all cases
|
||||||
normalized = urljoin(base_url, href.strip())
|
return urljoin(base_url, cleaned_href)
|
||||||
return normalized
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_url(
|
def normalize_url(
|
||||||
|
|||||||
@@ -459,7 +459,7 @@ async def handle_crawl_request(
|
|||||||
# await crawler.close()
|
# await crawler.close()
|
||||||
# except Exception as close_e:
|
# except Exception as close_e:
|
||||||
# logger.error(f"Error closing crawler during exception handling: {close_e}")
|
# logger.error(f"Error closing crawler during exception handling: {close_e}")
|
||||||
logger.error(f"Error closing crawler during exception handling: {close_e}")
|
logger.error(f"Error closing crawler during exception handling: {str(e)}")
|
||||||
|
|
||||||
# Measure memory even on error if possible
|
# Measure memory even on error if possible
|
||||||
end_mem_mb_error = _get_memory_mb()
|
end_mem_mb_error = _get_memory_mb()
|
||||||
@@ -518,7 +518,7 @@ async def handle_stream_crawl_request(
|
|||||||
# await crawler.close()
|
# await crawler.close()
|
||||||
# except Exception as close_e:
|
# except Exception as close_e:
|
||||||
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
|
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
|
||||||
logger.error(f"Error closing crawler during stream setup exception: {close_e}")
|
logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
|
||||||
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
|
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
|
||||||
# Raising HTTPException here will prevent streaming response
|
# Raising HTTPException here will prevent streaming response
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
|
|||||||
@@ -403,7 +403,7 @@ async def main():
|
|||||||
|
|
||||||
md_generator = DefaultMarkdownGenerator(
|
md_generator = DefaultMarkdownGenerator(
|
||||||
content_filter=filter,
|
content_filter=filter,
|
||||||
options={"ignore_links": True}
|
options={"ignore_links": True})
|
||||||
|
|
||||||
# 4) Crawler run config: skip cache, use extraction
|
# 4) Crawler run config: skip cache, use extraction
|
||||||
run_conf = CrawlerRunConfig(
|
run_conf = CrawlerRunConfig(
|
||||||
@@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig
|
||||||
|
|
||||||
async def crawl_web():
|
async def crawl_web():
|
||||||
config = CrawlerRunConfig(bypass_cache=True)
|
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://en.wikipedia.org/wiki/apple",
|
url="https://en.wikipedia.org/wiki/apple",
|
||||||
@@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig
|
||||||
|
|
||||||
async def crawl_local_file():
|
async def crawl_local_file():
|
||||||
local_file_path = "/path/to/apple.html" # Replace with your file path
|
local_file_path = "/path/to/apple.html" # Replace with your file path
|
||||||
file_url = f"file://{local_file_path}"
|
file_url = f"file://{local_file_path}"
|
||||||
config = CrawlerRunConfig(bypass_cache=True)
|
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(url=file_url, config=config)
|
result = await crawler.arun(url=file_url, config=config)
|
||||||
@@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`.
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig
|
||||||
|
|
||||||
async def crawl_raw_html():
|
async def crawl_raw_html():
|
||||||
raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
|
raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
|
||||||
raw_html_url = f"raw:{raw_html}"
|
raw_html_url = f"raw:{raw_html}"
|
||||||
config = CrawlerRunConfig(bypass_cache=True)
|
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(url=raw_html_url, config=config)
|
result = await crawler.arun(url=raw_html_url, config=config)
|
||||||
@@ -3845,7 +3845,7 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import asyncio
|
import asyncio
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
@@ -3856,7 +3856,7 @@ async def main():
|
|||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
# Step 1: Crawl the Web URL
|
# Step 1: Crawl the Web URL
|
||||||
print("\n=== Step 1: Crawling the Wikipedia URL ===")
|
print("\n=== Step 1: Crawling the Wikipedia URL ===")
|
||||||
web_config = CrawlerRunConfig(bypass_cache=True)
|
web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
result = await crawler.arun(url=wikipedia_url, config=web_config)
|
result = await crawler.arun(url=wikipedia_url, config=web_config)
|
||||||
|
|
||||||
if not result.success:
|
if not result.success:
|
||||||
@@ -3871,7 +3871,7 @@ async def main():
|
|||||||
# Step 2: Crawl from the Local HTML File
|
# Step 2: Crawl from the Local HTML File
|
||||||
print("=== Step 2: Crawling from the Local HTML File ===")
|
print("=== Step 2: Crawling from the Local HTML File ===")
|
||||||
file_url = f"file://{html_file_path.resolve()}"
|
file_url = f"file://{html_file_path.resolve()}"
|
||||||
file_config = CrawlerRunConfig(bypass_cache=True)
|
file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
local_result = await crawler.arun(url=file_url, config=file_config)
|
local_result = await crawler.arun(url=file_url, config=file_config)
|
||||||
|
|
||||||
if not local_result.success:
|
if not local_result.success:
|
||||||
@@ -3887,7 +3887,7 @@ async def main():
|
|||||||
with open(html_file_path, 'r', encoding='utf-8') as f:
|
with open(html_file_path, 'r', encoding='utf-8') as f:
|
||||||
raw_html_content = f.read()
|
raw_html_content = f.read()
|
||||||
raw_html_url = f"raw:{raw_html_content}"
|
raw_html_url = f"raw:{raw_html_content}"
|
||||||
raw_config = CrawlerRunConfig(bypass_cache=True)
|
raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
|
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
|
||||||
|
|
||||||
if not raw_result.success:
|
if not raw_result.success:
|
||||||
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
|
|||||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
|
||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
@@ -4175,8 +4175,13 @@ async def main():
|
|||||||
verbose=True
|
verbose=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
md_generator = DefaultMarkdownGenerator(
|
||||||
|
content_filter=filter,
|
||||||
|
options={"ignore_links": True}
|
||||||
|
)
|
||||||
|
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
content_filter=filter
|
markdown_generator=md_generator
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
@@ -5428,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
|
|||||||
```python
|
```python
|
||||||
import os, asyncio
|
import os, asyncio
|
||||||
from base64 import b64decode
|
from base64 import b64decode
|
||||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
screenshot=True,
|
||||||
|
pdf=True
|
||||||
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
|
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
|
||||||
cache_mode=CacheMode.BYPASS,
|
config=run_config
|
||||||
pdf=True,
|
|
||||||
screenshot=True
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.success:
|
if result.success:
|
||||||
# Save screenshot
|
print(f"Screenshot data present: {result.screenshot is not None}")
|
||||||
|
print(f"PDF data present: {result.pdf is not None}")
|
||||||
|
|
||||||
if result.screenshot:
|
if result.screenshot:
|
||||||
|
print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
|
||||||
with open("wikipedia_screenshot.png", "wb") as f:
|
with open("wikipedia_screenshot.png", "wb") as f:
|
||||||
f.write(b64decode(result.screenshot))
|
f.write(b64decode(result.screenshot))
|
||||||
|
else:
|
||||||
|
print("[WARN] Screenshot data is None.")
|
||||||
|
|
||||||
# Save PDF
|
|
||||||
if result.pdf:
|
if result.pdf:
|
||||||
|
print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
|
||||||
with open("wikipedia_page.pdf", "wb") as f:
|
with open("wikipedia_page.pdf", "wb") as f:
|
||||||
f.write(result.pdf)
|
f.write(result.pdf)
|
||||||
|
else:
|
||||||
|
print("[WARN] PDF data is None.")
|
||||||
|
|
||||||
print("[OK] PDF & screenshot captured.")
|
|
||||||
else:
|
else:
|
||||||
print("[ERROR]", result.error_message)
|
print("[ERROR]", result.error_message)
|
||||||
|
|
||||||
|
|||||||
@@ -12,8 +12,7 @@ class CrawlRequest(BaseModel):
|
|||||||
class MarkdownRequest(BaseModel):
|
class MarkdownRequest(BaseModel):
|
||||||
"""Request body for the /md endpoint."""
|
"""Request body for the /md endpoint."""
|
||||||
url: str = Field(..., description="Absolute http/https URL to fetch")
|
url: str = Field(..., description="Absolute http/https URL to fetch")
|
||||||
f: FilterType = Field(FilterType.FIT,
|
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
||||||
description="Content‑filter strategy: FIT, RAW, BM25, or LLM")
|
|
||||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||||
|
|
||||||
|
|||||||
@@ -66,29 +66,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
|
|||||||
```python
|
```python
|
||||||
import os, asyncio
|
import os, asyncio
|
||||||
from base64 import b64decode
|
from base64 import b64decode
|
||||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
screenshot=True,
|
||||||
|
pdf=True
|
||||||
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
|
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
|
||||||
cache_mode=CacheMode.BYPASS,
|
config=run_config
|
||||||
pdf=True,
|
|
||||||
screenshot=True
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.success:
|
if result.success:
|
||||||
# Save screenshot
|
print(f"Screenshot data present: {result.screenshot is not None}")
|
||||||
|
print(f"PDF data present: {result.pdf is not None}")
|
||||||
|
|
||||||
if result.screenshot:
|
if result.screenshot:
|
||||||
|
print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
|
||||||
with open("wikipedia_screenshot.png", "wb") as f:
|
with open("wikipedia_screenshot.png", "wb") as f:
|
||||||
f.write(b64decode(result.screenshot))
|
f.write(b64decode(result.screenshot))
|
||||||
|
else:
|
||||||
|
print("[WARN] Screenshot data is None.")
|
||||||
|
|
||||||
# Save PDF
|
|
||||||
if result.pdf:
|
if result.pdf:
|
||||||
|
print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
|
||||||
with open("wikipedia_page.pdf", "wb") as f:
|
with open("wikipedia_page.pdf", "wb") as f:
|
||||||
f.write(result.pdf)
|
f.write(result.pdf)
|
||||||
|
else:
|
||||||
|
print("[WARN] PDF data is None.")
|
||||||
|
|
||||||
print("[OK] PDF & screenshot captured.")
|
|
||||||
else:
|
else:
|
||||||
print("[ERROR]", result.error_message)
|
print("[ERROR]", result.error_message)
|
||||||
|
|
||||||
|
|||||||
@@ -25,44 +25,70 @@ Use an authenticated proxy with `BrowserConfig`:
|
|||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import BrowserConfig
|
from crawl4ai.async_configs import BrowserConfig
|
||||||
|
|
||||||
proxy_config = {
|
browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
|
||||||
"server": "http://proxy.example.com:8080",
|
|
||||||
"username": "user",
|
|
||||||
"password": "pass"
|
|
||||||
}
|
|
||||||
|
|
||||||
browser_config = BrowserConfig(proxy_config=proxy_config)
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(url="https://example.com")
|
||||||
```
|
```
|
||||||
|
|
||||||
Here's the corrected documentation:
|
|
||||||
|
|
||||||
## Rotating Proxies
|
## Rotating Proxies
|
||||||
|
|
||||||
Example using a proxy rotation service dynamically:
|
Example using a proxy rotation service dynamically:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
import re
|
||||||
|
from crawl4ai import (
|
||||||
async def get_next_proxy():
|
AsyncWebCrawler,
|
||||||
# Your proxy rotation logic here
|
BrowserConfig,
|
||||||
return {"server": "http://next.proxy.com:8080"}
|
CrawlerRunConfig,
|
||||||
|
CacheMode,
|
||||||
|
RoundRobinProxyStrategy,
|
||||||
|
)
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import ProxyConfig
|
||||||
async def main():
|
async def main():
|
||||||
browser_config = BrowserConfig()
|
# Load proxies and create rotation strategy
|
||||||
run_config = CrawlerRunConfig()
|
proxies = ProxyConfig.from_env()
|
||||||
|
# Example: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
|
||||||
|
if not proxies:
|
||||||
|
print("No proxies found in environment. Set PROXIES env variable!")
|
||||||
|
return
|
||||||
|
|
||||||
|
proxy_strategy = RoundRobinProxyStrategy(proxies)
|
||||||
|
|
||||||
|
# Create configs
|
||||||
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
proxy_rotation_strategy=proxy_strategy
|
||||||
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
# For each URL, create a new run config with different proxy
|
urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
|
||||||
for url in urls:
|
|
||||||
proxy = await get_next_proxy()
|
print("\n📈 Initializing crawler with proxy rotation...")
|
||||||
# Clone the config and update proxy - this creates a new browser context
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
current_config = run_config.clone(proxy_config=proxy)
|
print("\n🚀 Starting batch crawl with proxy rotation...")
|
||||||
result = await crawler.arun(url=url, config=current_config)
|
results = await crawler.arun_many(
|
||||||
|
urls=urls,
|
||||||
|
config=run_config
|
||||||
|
)
|
||||||
|
for result in results:
|
||||||
|
if result.success:
|
||||||
|
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
|
||||||
|
current_proxy = run_config.proxy_config if run_config.proxy_config else None
|
||||||
|
|
||||||
|
if current_proxy and ip_match:
|
||||||
|
print(f"URL {result.url}")
|
||||||
|
print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
|
||||||
|
verified = ip_match.group(0) == current_proxy.ip
|
||||||
|
if verified:
|
||||||
|
print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
|
||||||
|
else:
|
||||||
|
print("❌ Proxy failed or IP mismatch!")
|
||||||
|
print("---")
|
||||||
|
|
||||||
|
asyncio.run(main())
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import asyncio
|
|
||||||
asyncio.run(main())
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -273,7 +273,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
|
||||||
from crawl4ai import JsonCssExtractionStrategy
|
from crawl4ai import JsonCssExtractionStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
@@ -298,7 +298,7 @@ async def main():
|
|||||||
# 3) Example LLM content filtering
|
# 3) Example LLM content filtering
|
||||||
|
|
||||||
gemini_config = LLMConfig(
|
gemini_config = LLMConfig(
|
||||||
provider="gemini/gemini-1.5-pro"
|
provider="gemini/gemini-1.5-pro",
|
||||||
api_token = "env:GEMINI_API_TOKEN"
|
api_token = "env:GEMINI_API_TOKEN"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -322,8 +322,9 @@ async def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
md_generator = DefaultMarkdownGenerator(
|
md_generator = DefaultMarkdownGenerator(
|
||||||
content_filter=filter,
|
content_filter=filter,
|
||||||
options={"ignore_links": True}
|
options={"ignore_links": True}
|
||||||
|
)
|
||||||
|
|
||||||
# 4) Crawler run config: skip cache, use extraction
|
# 4) Crawler run config: skip cache, use extraction
|
||||||
run_conf = CrawlerRunConfig(
|
run_conf = CrawlerRunConfig(
|
||||||
|
|||||||
@@ -17,6 +17,9 @@
|
|||||||
- [Configuration Reference](#configuration-reference)
|
- [Configuration Reference](#configuration-reference)
|
||||||
- [Best Practices & Tips](#best-practices--tips)
|
- [Best Practices & Tips](#best-practices--tips)
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
The Crawl4AI CLI is installed automatically when you install the library.
|
||||||
|
|
||||||
## Basic Usage
|
## Basic Usage
|
||||||
|
|
||||||
The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
|
The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
|
||||||
|
|||||||
@@ -8,11 +8,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig
|
||||||
|
|
||||||
async def crawl_web():
|
async def crawl_web():
|
||||||
config = CrawlerRunConfig(bypass_cache=True)
|
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://en.wikipedia.org/wiki/apple",
|
url="https://en.wikipedia.org/wiki/apple",
|
||||||
@@ -33,13 +33,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig
|
||||||
|
|
||||||
async def crawl_local_file():
|
async def crawl_local_file():
|
||||||
local_file_path = "/path/to/apple.html" # Replace with your file path
|
local_file_path = "/path/to/apple.html" # Replace with your file path
|
||||||
file_url = f"file://{local_file_path}"
|
file_url = f"file://{local_file_path}"
|
||||||
config = CrawlerRunConfig(bypass_cache=True)
|
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(url=file_url, config=config)
|
result = await crawler.arun(url=file_url, config=config)
|
||||||
@@ -93,7 +93,7 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import asyncio
|
import asyncio
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from crawl4ai import AsyncWebCrawler
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig
|
from crawl4ai.async_configs import CrawlerRunConfig
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
@@ -104,7 +104,7 @@ async def main():
|
|||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
# Step 1: Crawl the Web URL
|
# Step 1: Crawl the Web URL
|
||||||
print("\n=== Step 1: Crawling the Wikipedia URL ===")
|
print("\n=== Step 1: Crawling the Wikipedia URL ===")
|
||||||
web_config = CrawlerRunConfig(bypass_cache=True)
|
web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
result = await crawler.arun(url=wikipedia_url, config=web_config)
|
result = await crawler.arun(url=wikipedia_url, config=web_config)
|
||||||
|
|
||||||
if not result.success:
|
if not result.success:
|
||||||
@@ -119,7 +119,7 @@ async def main():
|
|||||||
# Step 2: Crawl from the Local HTML File
|
# Step 2: Crawl from the Local HTML File
|
||||||
print("=== Step 2: Crawling from the Local HTML File ===")
|
print("=== Step 2: Crawling from the Local HTML File ===")
|
||||||
file_url = f"file://{html_file_path.resolve()}"
|
file_url = f"file://{html_file_path.resolve()}"
|
||||||
file_config = CrawlerRunConfig(bypass_cache=True)
|
file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
local_result = await crawler.arun(url=file_url, config=file_config)
|
local_result = await crawler.arun(url=file_url, config=file_config)
|
||||||
|
|
||||||
if not local_result.success:
|
if not local_result.success:
|
||||||
@@ -135,7 +135,7 @@ async def main():
|
|||||||
with open(html_file_path, 'r', encoding='utf-8') as f:
|
with open(html_file_path, 'r', encoding='utf-8') as f:
|
||||||
raw_html_content = f.read()
|
raw_html_content = f.read()
|
||||||
raw_html_url = f"raw:{raw_html_content}"
|
raw_html_url = f"raw:{raw_html_content}"
|
||||||
raw_config = CrawlerRunConfig(bypass_cache=True)
|
raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
|
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
|
||||||
|
|
||||||
if not raw_result.success:
|
if not raw_result.success:
|
||||||
|
|||||||
@@ -201,6 +201,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
|
|||||||
- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.
|
- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.
|
||||||
- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.
|
- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.
|
||||||
- **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
|
- **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
|
||||||
|
- **`language`** *(str, default `'english'`)*: Language used for stemming.
|
||||||
|
|
||||||
**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
|
**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
|
||||||
|
|
||||||
@@ -233,7 +234,7 @@ prune_filter = PruningContentFilter(
|
|||||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
|
||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
@@ -255,9 +256,12 @@ async def main():
|
|||||||
chunk_token_threshold=4096, # Adjust based on your needs
|
chunk_token_threshold=4096, # Adjust based on your needs
|
||||||
verbose=True
|
verbose=True
|
||||||
)
|
)
|
||||||
|
md_generator = DefaultMarkdownGenerator(
|
||||||
|
content_filter=filter,
|
||||||
|
options={"ignore_links": True}
|
||||||
|
)
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
content_filter=filter
|
markdown_generator=md_generator,
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ dependencies = [
|
|||||||
"lxml~=5.3",
|
"lxml~=5.3",
|
||||||
"litellm>=1.53.1",
|
"litellm>=1.53.1",
|
||||||
"numpy>=1.26.0,<3",
|
"numpy>=1.26.0,<3",
|
||||||
"pillow~=10.4",
|
"pillow>=10.4",
|
||||||
"playwright>=1.49.0",
|
"playwright>=1.49.0",
|
||||||
"python-dotenv~=1.0",
|
"python-dotenv~=1.0",
|
||||||
"requests~=2.26",
|
"requests~=2.26",
|
||||||
@@ -32,7 +32,6 @@ dependencies = [
|
|||||||
"psutil>=6.1.1",
|
"psutil>=6.1.1",
|
||||||
"nltk>=3.9.1",
|
"nltk>=3.9.1",
|
||||||
"playwright",
|
"playwright",
|
||||||
"aiofiles",
|
|
||||||
"rich>=13.9.4",
|
"rich>=13.9.4",
|
||||||
"cssselect>=1.2.0",
|
"cssselect>=1.2.0",
|
||||||
"httpx>=0.27.2",
|
"httpx>=0.27.2",
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ aiosqlite~=0.20
|
|||||||
lxml~=5.3
|
lxml~=5.3
|
||||||
litellm>=1.53.1
|
litellm>=1.53.1
|
||||||
numpy>=1.26.0,<3
|
numpy>=1.26.0,<3
|
||||||
pillow~=10.4
|
pillow>=10.4
|
||||||
playwright>=1.49.0
|
playwright>=1.49.0
|
||||||
python-dotenv~=1.0
|
python-dotenv~=1.0
|
||||||
requests~=2.26
|
requests~=2.26
|
||||||
@@ -27,3 +27,7 @@ httpx[http2]>=0.27.2
|
|||||||
sentence-transformers>=2.2.0
|
sentence-transformers>=2.2.0
|
||||||
alphashape>=1.3.1
|
alphashape>=1.3.1
|
||||||
shapely>=2.0.0
|
shapely>=2.0.0
|
||||||
|
|
||||||
|
fake-useragent>=2.2.0
|
||||||
|
pdf2image>=1.17.0
|
||||||
|
PyPDF2>=3.0.1
|
||||||
@@ -105,7 +105,7 @@ def test_docker_deployment(version="basic"):
|
|||||||
def test_basic_crawl(tester: Crawl4AiTester):
|
def test_basic_crawl(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Basic Crawl ===")
|
print("\n=== Testing Basic Crawl ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 10,
|
"priority": 10,
|
||||||
"session_id": "test",
|
"session_id": "test",
|
||||||
}
|
}
|
||||||
@@ -119,7 +119,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
|
|||||||
def test_basic_crawl_sync(tester: Crawl4AiTester):
|
def test_basic_crawl_sync(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Basic Crawl (Sync) ===")
|
print("\n=== Testing Basic Crawl (Sync) ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 10,
|
"priority": 10,
|
||||||
"session_id": "test",
|
"session_id": "test",
|
||||||
}
|
}
|
||||||
@@ -134,7 +134,7 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
|
|||||||
def test_js_execution(tester: Crawl4AiTester):
|
def test_js_execution(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing JS Execution ===")
|
print("\n=== Testing JS Execution ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"js_code": [
|
"js_code": [
|
||||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||||
@@ -151,7 +151,7 @@ def test_js_execution(tester: Crawl4AiTester):
|
|||||||
def test_css_selector(tester: Crawl4AiTester):
|
def test_css_selector(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing CSS Selector ===")
|
print("\n=== Testing CSS Selector ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 7,
|
"priority": 7,
|
||||||
"css_selector": ".wide-tease-item__description",
|
"css_selector": ".wide-tease-item__description",
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
@@ -188,7 +188,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.coinbase.com/explore",
|
"urls": ["https://www.coinbase.com/explore"],
|
||||||
"priority": 9,
|
"priority": 9,
|
||||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||||
}
|
}
|
||||||
@@ -223,7 +223,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://openai.com/api/pricing",
|
"urls": ["https://openai.com/api/pricing"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -270,7 +270,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -297,7 +297,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
|||||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Cosine Extraction ===")
|
print("\n=== Testing Cosine Extraction ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "cosine",
|
"type": "cosine",
|
||||||
@@ -323,7 +323,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
|||||||
def test_screenshot(tester: Crawl4AiTester):
|
def test_screenshot(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Screenshot ===")
|
print("\n=== Testing Screenshot ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 5,
|
"priority": 5,
|
||||||
"screenshot": True,
|
"screenshot": True,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ def test_docker_deployment(version="basic"):
|
|||||||
|
|
||||||
def test_basic_crawl(tester: Crawl4AiTester):
|
def test_basic_crawl(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Basic Crawl ===")
|
print("\n=== Testing Basic Crawl ===")
|
||||||
request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
|
request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
|
||||||
|
|
||||||
result = tester.submit_and_wait(request)
|
result = tester.submit_and_wait(request)
|
||||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||||
@@ -85,7 +85,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
|
|||||||
def test_js_execution(tester: Crawl4AiTester):
|
def test_js_execution(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing JS Execution ===")
|
print("\n=== Testing JS Execution ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"js_code": [
|
"js_code": [
|
||||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||||
@@ -102,7 +102,7 @@ def test_js_execution(tester: Crawl4AiTester):
|
|||||||
def test_css_selector(tester: Crawl4AiTester):
|
def test_css_selector(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing CSS Selector ===")
|
print("\n=== Testing CSS Selector ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 7,
|
"priority": 7,
|
||||||
"css_selector": ".wide-tease-item__description",
|
"css_selector": ".wide-tease-item__description",
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
@@ -139,7 +139,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.coinbase.com/explore",
|
"urls": ["https://www.coinbase.com/explore"],
|
||||||
"priority": 9,
|
"priority": 9,
|
||||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||||
}
|
}
|
||||||
@@ -174,7 +174,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://openai.com/api/pricing",
|
"urls": ["https://openai.com/api/pricing"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -221,7 +221,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -248,7 +248,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
|||||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Cosine Extraction ===")
|
print("\n=== Testing Cosine Extraction ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "cosine",
|
"type": "cosine",
|
||||||
@@ -274,7 +274,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
|||||||
def test_screenshot(tester: Crawl4AiTester):
|
def test_screenshot(tester: Crawl4AiTester):
|
||||||
print("\n=== Testing Screenshot ===")
|
print("\n=== Testing Screenshot ===")
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 5,
|
"priority": 5,
|
||||||
"screenshot": True,
|
"screenshot": True,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ class NBCNewsAPITest:
|
|||||||
async def test_basic_crawl():
|
async def test_basic_crawl():
|
||||||
print("\n=== Testing Basic Crawl ===")
|
print("\n=== Testing Basic Crawl ===")
|
||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
|
request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
|
||||||
task_id = await api.submit_crawl(request)
|
task_id = await api.submit_crawl(request)
|
||||||
result = await api.wait_for_task(task_id)
|
result = await api.wait_for_task(task_id)
|
||||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||||
@@ -67,7 +67,7 @@ async def test_js_execution():
|
|||||||
print("\n=== Testing JS Execution ===")
|
print("\n=== Testing JS Execution ===")
|
||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"js_code": [
|
"js_code": [
|
||||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||||
@@ -86,7 +86,7 @@ async def test_css_selector():
|
|||||||
print("\n=== Testing CSS Selector ===")
|
print("\n=== Testing CSS Selector ===")
|
||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 7,
|
"priority": 7,
|
||||||
"css_selector": ".wide-tease-item__description",
|
"css_selector": ".wide-tease-item__description",
|
||||||
}
|
}
|
||||||
@@ -120,7 +120,7 @@ async def test_structured_extraction():
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 9,
|
"priority": 9,
|
||||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||||
}
|
}
|
||||||
@@ -177,7 +177,7 @@ async def test_llm_extraction():
|
|||||||
}
|
}
|
||||||
|
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 8,
|
"priority": 8,
|
||||||
"extraction_config": {
|
"extraction_config": {
|
||||||
"type": "llm",
|
"type": "llm",
|
||||||
@@ -209,7 +209,7 @@ async def test_screenshot():
|
|||||||
print("\n=== Testing Screenshot ===")
|
print("\n=== Testing Screenshot ===")
|
||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
request = {
|
request = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 5,
|
"priority": 5,
|
||||||
"screenshot": True,
|
"screenshot": True,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
@@ -227,7 +227,7 @@ async def test_priority_handling():
|
|||||||
async with NBCNewsAPITest() as api:
|
async with NBCNewsAPITest() as api:
|
||||||
# Submit low priority task first
|
# Submit low priority task first
|
||||||
low_priority = {
|
low_priority = {
|
||||||
"urls": "https://www.nbcnews.com/business",
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"priority": 1,
|
"priority": 1,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
}
|
}
|
||||||
@@ -235,7 +235,7 @@ async def test_priority_handling():
|
|||||||
|
|
||||||
# Submit high priority task
|
# Submit high priority task
|
||||||
high_priority = {
|
high_priority = {
|
||||||
"urls": "https://www.nbcnews.com/business/consumer",
|
"urls": ["https://www.nbcnews.com/business/consumer"],
|
||||||
"priority": 10,
|
"priority": 10,
|
||||||
"crawler_params": {"headless": True},
|
"crawler_params": {"headless": True},
|
||||||
}
|
}
|
||||||
|
|||||||
91
tests/test_normalize_url.py
Normal file
91
tests/test_normalize_url.py
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
import unittest
|
||||||
|
from crawl4ai.utils import normalize_url
|
||||||
|
|
||||||
|
class TestNormalizeUrl(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_basic_relative_path(self):
|
||||||
|
self.assertEqual(normalize_url("path/to/page.html", "http://example.com/base/"), "http://example.com/base/path/to/page.html")
|
||||||
|
|
||||||
|
def test_base_url_with_trailing_slash(self):
|
||||||
|
self.assertEqual(normalize_url("page.html", "http://example.com/base/"), "http://example.com/base/page.html")
|
||||||
|
|
||||||
|
def test_base_url_without_trailing_slash(self):
|
||||||
|
# If normalize_url correctly uses urljoin, "base" is treated as a file.
|
||||||
|
self.assertEqual(normalize_url("page.html", "http://example.com/base"), "http://example.com/page.html")
|
||||||
|
|
||||||
|
def test_absolute_url_as_href(self):
|
||||||
|
self.assertEqual(normalize_url("http://another.com/page.html", "http://example.com/"), "http://another.com/page.html")
|
||||||
|
|
||||||
|
def test_href_with_leading_trailing_spaces(self):
|
||||||
|
self.assertEqual(normalize_url(" page.html ", "http://example.com/"), "http://example.com/page.html")
|
||||||
|
|
||||||
|
def test_empty_href(self):
|
||||||
|
# urljoin with an empty href and base ending in '/' returns the base.
|
||||||
|
self.assertEqual(normalize_url("", "http://example.com/base/"), "http://example.com/base/")
|
||||||
|
# urljoin with an empty href and base not ending in '/' also returns base.
|
||||||
|
self.assertEqual(normalize_url("", "http://example.com/base"), "http://example.com/base")
|
||||||
|
|
||||||
|
def test_href_with_query_parameters(self):
|
||||||
|
self.assertEqual(normalize_url("page.html?query=test", "http://example.com/"), "http://example.com/page.html?query=test")
|
||||||
|
|
||||||
|
def test_href_with_fragment(self):
|
||||||
|
self.assertEqual(normalize_url("page.html#section", "http://example.com/"), "http://example.com/page.html#section")
|
||||||
|
|
||||||
|
def test_different_scheme_in_href(self):
|
||||||
|
self.assertEqual(normalize_url("https://secure.example.com/page.html", "http://example.com/"), "https://secure.example.com/page.html")
|
||||||
|
|
||||||
|
def test_parent_directory_in_href(self):
|
||||||
|
self.assertEqual(normalize_url("../otherpage.html", "http://example.com/base/current/"), "http://example.com/base/otherpage.html")
|
||||||
|
|
||||||
|
def test_root_relative_href(self):
|
||||||
|
self.assertEqual(normalize_url("/otherpage.html", "http://example.com/base/current/"), "http://example.com/otherpage.html")
|
||||||
|
|
||||||
|
def test_base_url_with_path_and_no_trailing_slash(self):
|
||||||
|
# If normalize_url correctly uses urljoin, "path" is treated as a file.
|
||||||
|
self.assertEqual(normalize_url("file.html", "http://example.com/path"), "http://example.com/file.html")
|
||||||
|
|
||||||
|
def test_base_url_is_just_domain(self):
|
||||||
|
self.assertEqual(normalize_url("page.html", "http://example.com"), "http://example.com/page.html")
|
||||||
|
|
||||||
|
def test_href_is_only_query(self):
|
||||||
|
self.assertEqual(normalize_url("?query=true", "http://example.com/page.html"), "http://example.com/page.html?query=true")
|
||||||
|
|
||||||
|
def test_href_is_only_fragment(self):
|
||||||
|
self.assertEqual(normalize_url("#fragment", "http://example.com/page.html"), "http://example.com/page.html#fragment")
|
||||||
|
|
||||||
|
def test_relative_link_from_base_file_url(self):
|
||||||
|
"""
|
||||||
|
Tests the specific bug report: relative links from a base URL that is a file.
|
||||||
|
Example:
|
||||||
|
Page URL: http://example.com/path/to/document.html
|
||||||
|
Link on page: <a href="./file.xlsx">
|
||||||
|
Expected: http://example.com/path/to/file.xlsx
|
||||||
|
"""
|
||||||
|
base_url_file = "http://example.com/zwgk/fdzdgk/zdxx/spaq/t19360680.shtml"
|
||||||
|
href_relative_current_dir = "./P020241203375994691134.xlsx"
|
||||||
|
expected_url1 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/P020241203375994691134.xlsx"
|
||||||
|
self.assertEqual(normalize_url(href_relative_current_dir, base_url_file), expected_url1)
|
||||||
|
|
||||||
|
# Test with a relative link that doesn't start with "./"
|
||||||
|
href_relative_no_dot_slash = "another.doc"
|
||||||
|
expected_url2 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/another.doc"
|
||||||
|
self.assertEqual(normalize_url(href_relative_no_dot_slash, base_url_file), expected_url2)
|
||||||
|
|
||||||
|
def test_invalid_base_url_scheme(self):
|
||||||
|
with self.assertRaises(ValueError) as context:
|
||||||
|
normalize_url("page.html", "ftp://example.com/")
|
||||||
|
self.assertIn("Invalid base URL format", str(context.exception))
|
||||||
|
|
||||||
|
def test_invalid_base_url_netloc(self):
|
||||||
|
with self.assertRaises(ValueError) as context:
|
||||||
|
normalize_url("page.html", "http:///path/")
|
||||||
|
self.assertIn("Invalid base URL format", str(context.exception))
|
||||||
|
|
||||||
|
def test_base_url_with_port(self):
|
||||||
|
self.assertEqual(normalize_url("path/file.html", "http://example.com:8080/base/"), "http://example.com:8080/base/path/file.html")
|
||||||
|
|
||||||
|
def test_href_with_special_characters(self):
|
||||||
|
self.assertEqual(normalize_url("path%20with%20spaces/file.html", "http://example.com/"), "http://example.com/path%20with%20spaces/file.html")
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
Reference in New Issue
Block a user