Compare commits
29 Commits
fix/docker
...
fix/case_s
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
89679cee67 | ||
|
|
84ba78c852 | ||
|
|
3899ac3d3b | ||
|
|
23431d8109 | ||
|
|
1717827732 | ||
|
|
f8eaf01ed1 | ||
|
|
14b42b1f9a | ||
|
|
3bc56dd028 | ||
|
|
1874a7b8d2 | ||
|
|
0482c1eafc | ||
|
|
6a3b3e9d38 | ||
|
|
1eacea1d2d | ||
|
|
bc6d8147d2 | ||
|
|
487839640f | ||
|
|
6772134a3a | ||
|
|
ae67d66b81 | ||
|
|
af28e84a21 | ||
|
|
5e7fcb17e1 | ||
|
|
2de200c1ba | ||
|
|
9749e2832d | ||
|
|
70f473b84d | ||
|
|
bdacf61ca9 | ||
|
|
f566c5a376 | ||
|
|
4ed33fce9e | ||
|
|
f7a3366f72 | ||
|
|
2ad3fb5fc8 | ||
|
|
f2da460bb9 | ||
|
|
b1dff5a4d3 | ||
|
|
88a9fbbb7e |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -265,7 +265,7 @@ CLAUDE.md
|
|||||||
tests/**/test_site
|
tests/**/test_site
|
||||||
tests/**/reports
|
tests/**/reports
|
||||||
tests/**/benchmark_reports
|
tests/**/benchmark_reports
|
||||||
|
test_scripts/
|
||||||
docs/**/data
|
docs/**/data
|
||||||
.codecat/
|
.codecat/
|
||||||
|
|
||||||
|
|||||||
10
CHANGELOG.md
10
CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
|
||||||
|
- Maintains HTTPS scheme for internal links even when servers redirect to HTTP
|
||||||
|
- Prevents security downgrades during deep crawling
|
||||||
|
- Useful for security-conscious crawling and sites supporting both protocols
|
||||||
|
- Fully backward compatible with opt-in flag (default: `False`)
|
||||||
|
- Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
|
||||||
|
|
||||||
## [0.7.3] - 2025-08-09
|
## [0.7.3] - 2025-08-09
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ import re
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig
|
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig, LLMConfig
|
||||||
from crawl4ai.models import Link, CrawlResult
|
from crawl4ai.models import Link, CrawlResult
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@@ -178,7 +178,7 @@ class AdaptiveConfig:
|
|||||||
|
|
||||||
# Embedding strategy parameters
|
# Embedding strategy parameters
|
||||||
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
|
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
|
||||||
embedding_llm_config: Optional[Dict] = None # Separate config for embeddings
|
embedding_llm_config: Optional[Union[LLMConfig, Dict]] = None # Separate config for embeddings
|
||||||
n_query_variations: int = 10
|
n_query_variations: int = 10
|
||||||
coverage_threshold: float = 0.85
|
coverage_threshold: float = 0.85
|
||||||
alpha_shape_alpha: float = 0.5
|
alpha_shape_alpha: float = 0.5
|
||||||
@@ -250,6 +250,30 @@ class AdaptiveConfig:
|
|||||||
assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1"
|
assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1"
|
||||||
assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive"
|
assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive"
|
||||||
assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1"
|
assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1"
|
||||||
|
|
||||||
|
@property
|
||||||
|
def _embedding_llm_config_dict(self) -> Optional[Dict]:
|
||||||
|
"""Convert LLMConfig to dict format for backward compatibility."""
|
||||||
|
if self.embedding_llm_config is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if isinstance(self.embedding_llm_config, dict):
|
||||||
|
# Already a dict - return as-is for backward compatibility
|
||||||
|
return self.embedding_llm_config
|
||||||
|
|
||||||
|
# Convert LLMConfig object to dict format
|
||||||
|
return {
|
||||||
|
'provider': self.embedding_llm_config.provider,
|
||||||
|
'api_token': self.embedding_llm_config.api_token,
|
||||||
|
'base_url': getattr(self.embedding_llm_config, 'base_url', None),
|
||||||
|
'temperature': getattr(self.embedding_llm_config, 'temperature', None),
|
||||||
|
'max_tokens': getattr(self.embedding_llm_config, 'max_tokens', None),
|
||||||
|
'top_p': getattr(self.embedding_llm_config, 'top_p', None),
|
||||||
|
'frequency_penalty': getattr(self.embedding_llm_config, 'frequency_penalty', None),
|
||||||
|
'presence_penalty': getattr(self.embedding_llm_config, 'presence_penalty', None),
|
||||||
|
'stop': getattr(self.embedding_llm_config, 'stop', None),
|
||||||
|
'n': getattr(self.embedding_llm_config, 'n', None),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class CrawlStrategy(ABC):
|
class CrawlStrategy(ABC):
|
||||||
@@ -593,7 +617,7 @@ class StatisticalStrategy(CrawlStrategy):
|
|||||||
class EmbeddingStrategy(CrawlStrategy):
|
class EmbeddingStrategy(CrawlStrategy):
|
||||||
"""Embedding-based adaptive crawling using semantic space coverage"""
|
"""Embedding-based adaptive crawling using semantic space coverage"""
|
||||||
|
|
||||||
def __init__(self, embedding_model: str = None, llm_config: Dict = None):
|
def __init__(self, embedding_model: str = None, llm_config: Union[LLMConfig, Dict] = None):
|
||||||
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
|
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
|
||||||
self.llm_config = llm_config
|
self.llm_config = llm_config
|
||||||
self._embedding_cache = {}
|
self._embedding_cache = {}
|
||||||
@@ -605,14 +629,24 @@ class EmbeddingStrategy(CrawlStrategy):
|
|||||||
self._kb_embeddings_hash = None # Track KB changes
|
self._kb_embeddings_hash = None # Track KB changes
|
||||||
self._validation_embeddings_cache = None # Cache validation query embeddings
|
self._validation_embeddings_cache = None # Cache validation query embeddings
|
||||||
self._kb_similarity_threshold = 0.95 # Threshold for deduplication
|
self._kb_similarity_threshold = 0.95 # Threshold for deduplication
|
||||||
|
|
||||||
|
def _get_embedding_llm_config_dict(self) -> Dict:
|
||||||
|
"""Get embedding LLM config as dict with fallback to default."""
|
||||||
|
if hasattr(self, 'config') and self.config:
|
||||||
|
config_dict = self.config._embedding_llm_config_dict
|
||||||
|
if config_dict:
|
||||||
|
return config_dict
|
||||||
|
|
||||||
|
# Fallback to default if no config provided
|
||||||
|
return {
|
||||||
|
'provider': 'openai/text-embedding-3-small',
|
||||||
|
'api_token': os.getenv('OPENAI_API_KEY')
|
||||||
|
}
|
||||||
|
|
||||||
async def _get_embeddings(self, texts: List[str]) -> Any:
|
async def _get_embeddings(self, texts: List[str]) -> Any:
|
||||||
"""Get embeddings using configured method"""
|
"""Get embeddings using configured method"""
|
||||||
from .utils import get_text_embeddings
|
from .utils import get_text_embeddings
|
||||||
embedding_llm_config = {
|
embedding_llm_config = self._get_embedding_llm_config_dict()
|
||||||
'provider': 'openai/text-embedding-3-small',
|
|
||||||
'api_token': os.getenv('OPENAI_API_KEY')
|
|
||||||
}
|
|
||||||
return await get_text_embeddings(
|
return await get_text_embeddings(
|
||||||
texts,
|
texts,
|
||||||
embedding_llm_config,
|
embedding_llm_config,
|
||||||
@@ -679,8 +713,20 @@ class EmbeddingStrategy(CrawlStrategy):
|
|||||||
Return as a JSON array of strings."""
|
Return as a JSON array of strings."""
|
||||||
|
|
||||||
# Use the LLM for query generation
|
# Use the LLM for query generation
|
||||||
provider = self.llm_config.get('provider', 'openai/gpt-4o-mini') if self.llm_config else 'openai/gpt-4o-mini'
|
# Convert LLMConfig to dict if needed
|
||||||
api_token = self.llm_config.get('api_token') if self.llm_config else None
|
llm_config_dict = None
|
||||||
|
if self.llm_config:
|
||||||
|
if isinstance(self.llm_config, dict):
|
||||||
|
llm_config_dict = self.llm_config
|
||||||
|
else:
|
||||||
|
# Convert LLMConfig object to dict
|
||||||
|
llm_config_dict = {
|
||||||
|
'provider': self.llm_config.provider,
|
||||||
|
'api_token': self.llm_config.api_token
|
||||||
|
}
|
||||||
|
|
||||||
|
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
|
||||||
|
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
|
||||||
|
|
||||||
# response = perform_completion_with_backoff(
|
# response = perform_completion_with_backoff(
|
||||||
# provider=provider,
|
# provider=provider,
|
||||||
@@ -843,10 +889,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
|||||||
|
|
||||||
# Batch embed only uncached links
|
# Batch embed only uncached links
|
||||||
if texts_to_embed:
|
if texts_to_embed:
|
||||||
embedding_llm_config = {
|
embedding_llm_config = self._get_embedding_llm_config_dict()
|
||||||
'provider': 'openai/text-embedding-3-small',
|
|
||||||
'api_token': os.getenv('OPENAI_API_KEY')
|
|
||||||
}
|
|
||||||
new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model)
|
new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model)
|
||||||
|
|
||||||
# Cache the new embeddings
|
# Cache the new embeddings
|
||||||
@@ -1184,10 +1227,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Get embeddings for new texts
|
# Get embeddings for new texts
|
||||||
embedding_llm_config = {
|
embedding_llm_config = self._get_embedding_llm_config_dict()
|
||||||
'provider': 'openai/text-embedding-3-small',
|
|
||||||
'api_token': os.getenv('OPENAI_API_KEY')
|
|
||||||
}
|
|
||||||
new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model)
|
new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model)
|
||||||
|
|
||||||
# Deduplicate embeddings before adding to KB
|
# Deduplicate embeddings before adding to KB
|
||||||
@@ -1256,10 +1296,12 @@ class AdaptiveCrawler:
|
|||||||
if strategy_name == "statistical":
|
if strategy_name == "statistical":
|
||||||
return StatisticalStrategy()
|
return StatisticalStrategy()
|
||||||
elif strategy_name == "embedding":
|
elif strategy_name == "embedding":
|
||||||
return EmbeddingStrategy(
|
strategy = EmbeddingStrategy(
|
||||||
embedding_model=self.config.embedding_model,
|
embedding_model=self.config.embedding_model,
|
||||||
llm_config=self.config.embedding_llm_config
|
llm_config=self.config.embedding_llm_config
|
||||||
)
|
)
|
||||||
|
strategy.config = self.config # Pass config to strategy
|
||||||
|
return strategy
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown strategy: {strategy_name}")
|
raise ValueError(f"Unknown strategy: {strategy_name}")
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
import warnings
|
||||||
from .config import (
|
from .config import (
|
||||||
DEFAULT_PROVIDER,
|
DEFAULT_PROVIDER,
|
||||||
DEFAULT_PROVIDER_API_KEY,
|
DEFAULT_PROVIDER_API_KEY,
|
||||||
@@ -257,24 +258,39 @@ class ProxyConfig:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_string(proxy_str: str) -> "ProxyConfig":
|
def from_string(proxy_str: str) -> "ProxyConfig":
|
||||||
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
|
"""Create a ProxyConfig from a string.
|
||||||
parts = proxy_str.split(":")
|
|
||||||
if len(parts) == 4: # ip:port:username:password
|
Supported formats:
|
||||||
|
- 'http://username:password@ip:port'
|
||||||
|
- 'http://ip:port'
|
||||||
|
- 'socks5://ip:port'
|
||||||
|
- 'ip:port:username:password'
|
||||||
|
- 'ip:port'
|
||||||
|
"""
|
||||||
|
s = (proxy_str or "").strip()
|
||||||
|
# URL with credentials
|
||||||
|
if "@" in s and "://" in s:
|
||||||
|
auth_part, server_part = s.split("@", 1)
|
||||||
|
protocol, credentials = auth_part.split("://", 1)
|
||||||
|
if ":" in credentials:
|
||||||
|
username, password = credentials.split(":", 1)
|
||||||
|
return ProxyConfig(
|
||||||
|
server=f"{protocol}://{server_part}",
|
||||||
|
username=username,
|
||||||
|
password=password,
|
||||||
|
)
|
||||||
|
# URL without credentials (keep scheme)
|
||||||
|
if "://" in s and "@" not in s:
|
||||||
|
return ProxyConfig(server=s)
|
||||||
|
# Colon separated forms
|
||||||
|
parts = s.split(":")
|
||||||
|
if len(parts) == 4:
|
||||||
ip, port, username, password = parts
|
ip, port, username, password = parts
|
||||||
return ProxyConfig(
|
return ProxyConfig(server=f"http://{ip}:{port}", username=username, password=password)
|
||||||
server=f"http://{ip}:{port}",
|
if len(parts) == 2:
|
||||||
username=username,
|
|
||||||
password=password,
|
|
||||||
ip=ip
|
|
||||||
)
|
|
||||||
elif len(parts) == 2: # ip:port only
|
|
||||||
ip, port = parts
|
ip, port = parts
|
||||||
return ProxyConfig(
|
return ProxyConfig(server=f"http://{ip}:{port}")
|
||||||
server=f"http://{ip}:{port}",
|
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
||||||
ip=ip
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
||||||
@@ -438,6 +454,7 @@ class BrowserConfig:
|
|||||||
host: str = "localhost",
|
host: str = "localhost",
|
||||||
enable_stealth: bool = False,
|
enable_stealth: bool = False,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.browser_type = browser_type
|
self.browser_type = browser_type
|
||||||
self.headless = headless
|
self.headless = headless
|
||||||
self.browser_mode = browser_mode
|
self.browser_mode = browser_mode
|
||||||
@@ -450,13 +467,22 @@ class BrowserConfig:
|
|||||||
if self.browser_type in ["firefox", "webkit"]:
|
if self.browser_type in ["firefox", "webkit"]:
|
||||||
self.channel = ""
|
self.channel = ""
|
||||||
self.chrome_channel = ""
|
self.chrome_channel = ""
|
||||||
|
if proxy:
|
||||||
|
warnings.warn("The 'proxy' parameter is deprecated and will be removed in a future release. Use 'proxy_config' instead.", UserWarning)
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
self.proxy_config = proxy_config
|
self.proxy_config = proxy_config
|
||||||
if isinstance(self.proxy_config, dict):
|
if isinstance(self.proxy_config, dict):
|
||||||
self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
|
self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
|
||||||
if isinstance(self.proxy_config, str):
|
if isinstance(self.proxy_config, str):
|
||||||
self.proxy_config = ProxyConfig.from_string(self.proxy_config)
|
self.proxy_config = ProxyConfig.from_string(self.proxy_config)
|
||||||
|
|
||||||
|
if self.proxy and self.proxy_config:
|
||||||
|
warnings.warn("Both 'proxy' and 'proxy_config' are provided. 'proxy_config' will take precedence.", UserWarning)
|
||||||
|
self.proxy = None
|
||||||
|
elif self.proxy:
|
||||||
|
# Convert proxy string to ProxyConfig if proxy_config is not provided
|
||||||
|
self.proxy_config = ProxyConfig.from_string(self.proxy)
|
||||||
|
self.proxy = None
|
||||||
|
|
||||||
self.viewport_width = viewport_width
|
self.viewport_width = viewport_width
|
||||||
self.viewport_height = viewport_height
|
self.viewport_height = viewport_height
|
||||||
@@ -834,12 +860,6 @@ class HTTPCrawlerConfig:
|
|||||||
return HTTPCrawlerConfig.from_kwargs(config)
|
return HTTPCrawlerConfig.from_kwargs(config)
|
||||||
|
|
||||||
class CrawlerRunConfig():
|
class CrawlerRunConfig():
|
||||||
_UNWANTED_PROPS = {
|
|
||||||
'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED',
|
|
||||||
'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS',
|
|
||||||
'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY',
|
|
||||||
'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY',
|
|
||||||
}
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Configuration class for controlling how the crawler runs each crawl operation.
|
Configuration class for controlling how the crawler runs each crawl operation.
|
||||||
@@ -1046,6 +1066,12 @@ class CrawlerRunConfig():
|
|||||||
|
|
||||||
url: str = None # This is not a compulsory parameter
|
url: str = None # This is not a compulsory parameter
|
||||||
"""
|
"""
|
||||||
|
_UNWANTED_PROPS = {
|
||||||
|
'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED',
|
||||||
|
'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS',
|
||||||
|
'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY',
|
||||||
|
'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY',
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -1124,6 +1150,7 @@ class CrawlerRunConfig():
|
|||||||
exclude_domains: list = None,
|
exclude_domains: list = None,
|
||||||
exclude_internal_links: bool = False,
|
exclude_internal_links: bool = False,
|
||||||
score_links: bool = False,
|
score_links: bool = False,
|
||||||
|
preserve_https_for_internal_links: bool = False,
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
log_console: bool = False,
|
log_console: bool = False,
|
||||||
@@ -1247,6 +1274,7 @@ class CrawlerRunConfig():
|
|||||||
self.exclude_domains = exclude_domains or []
|
self.exclude_domains = exclude_domains or []
|
||||||
self.exclude_internal_links = exclude_internal_links
|
self.exclude_internal_links = exclude_internal_links
|
||||||
self.score_links = score_links
|
self.score_links = score_links
|
||||||
|
self.preserve_https_for_internal_links = preserve_https_for_internal_links
|
||||||
|
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
@@ -1520,6 +1548,7 @@ class CrawlerRunConfig():
|
|||||||
exclude_domains=kwargs.get("exclude_domains", []),
|
exclude_domains=kwargs.get("exclude_domains", []),
|
||||||
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
||||||
score_links=kwargs.get("score_links", False),
|
score_links=kwargs.get("score_links", False),
|
||||||
|
preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose=kwargs.get("verbose", True),
|
verbose=kwargs.get("verbose", True),
|
||||||
log_console=kwargs.get("log_console", False),
|
log_console=kwargs.get("log_console", False),
|
||||||
@@ -1626,6 +1655,7 @@ class CrawlerRunConfig():
|
|||||||
"exclude_domains": self.exclude_domains,
|
"exclude_domains": self.exclude_domains,
|
||||||
"exclude_internal_links": self.exclude_internal_links,
|
"exclude_internal_links": self.exclude_internal_links,
|
||||||
"score_links": self.score_links,
|
"score_links": self.score_links,
|
||||||
|
"preserve_https_for_internal_links": self.preserve_https_for_internal_links,
|
||||||
"verbose": self.verbose,
|
"verbose": self.verbose,
|
||||||
"log_console": self.log_console,
|
"log_console": self.log_console,
|
||||||
"capture_network_requests": self.capture_network_requests,
|
"capture_network_requests": self.capture_network_requests,
|
||||||
|
|||||||
@@ -354,6 +354,7 @@ class AsyncWebCrawler:
|
|||||||
###############################################################
|
###############################################################
|
||||||
# Process the HTML content, Call CrawlerStrategy.process_html #
|
# Process the HTML content, Call CrawlerStrategy.process_html #
|
||||||
###############################################################
|
###############################################################
|
||||||
|
from urllib.parse import urlparse
|
||||||
crawl_result: CrawlResult = await self.aprocess_html(
|
crawl_result: CrawlResult = await self.aprocess_html(
|
||||||
url=url,
|
url=url,
|
||||||
html=html,
|
html=html,
|
||||||
@@ -364,6 +365,7 @@ class AsyncWebCrawler:
|
|||||||
verbose=config.verbose,
|
verbose=config.verbose,
|
||||||
is_raw_html=True if url.startswith("raw:") else False,
|
is_raw_html=True if url.startswith("raw:") else False,
|
||||||
redirected_url=async_response.redirected_url,
|
redirected_url=async_response.redirected_url,
|
||||||
|
original_scheme=urlparse(url).scheme,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ from .js_snippet import load_js_script
|
|||||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .utils import get_chromium_path
|
from .utils import get_chromium_path
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
|
||||||
BROWSER_DISABLE_OPTIONS = [
|
BROWSER_DISABLE_OPTIONS = [
|
||||||
@@ -741,17 +742,18 @@ class BrowserManager:
|
|||||||
)
|
)
|
||||||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||||||
|
|
||||||
if self.config.proxy or self.config.proxy_config:
|
if self.config.proxy:
|
||||||
|
warnings.warn(
|
||||||
|
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
|
||||||
|
DeprecationWarning,
|
||||||
|
)
|
||||||
|
if self.config.proxy_config:
|
||||||
from playwright.async_api import ProxySettings
|
from playwright.async_api import ProxySettings
|
||||||
|
|
||||||
proxy_settings = (
|
proxy_settings = ProxySettings(
|
||||||
ProxySettings(server=self.config.proxy)
|
server=self.config.proxy_config.server,
|
||||||
if self.config.proxy
|
username=self.config.proxy_config.username,
|
||||||
else ProxySettings(
|
password=self.config.proxy_config.password,
|
||||||
server=self.config.proxy_config.server,
|
|
||||||
username=self.config.proxy_config.username,
|
|
||||||
password=self.config.proxy_config.password,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
browser_args["proxy"] = proxy_settings
|
browser_args["proxy"] = proxy_settings
|
||||||
|
|
||||||
|
|||||||
@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
normalized_href = normalize_url(href, url)
|
normalized_href = normalize_url(
|
||||||
|
href, url,
|
||||||
|
preserve_https=kwargs.get('preserve_https_for_internal_links', False),
|
||||||
|
original_scheme=kwargs.get('original_scheme')
|
||||||
|
)
|
||||||
link_data = {
|
link_data = {
|
||||||
"href": normalized_href,
|
"href": normalized_href,
|
||||||
"text": link.text_content().strip(),
|
"text": link.text_content().strip(),
|
||||||
|
|||||||
@@ -122,11 +122,6 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
|
|
||||||
valid_links.append(base_url)
|
valid_links.append(base_url)
|
||||||
|
|
||||||
# If we have more valid links than capacity, limit them
|
|
||||||
if len(valid_links) > remaining_capacity:
|
|
||||||
valid_links = valid_links[:remaining_capacity]
|
|
||||||
self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
|
|
||||||
|
|
||||||
# Record the new depths and add to next_links
|
# Record the new depths and add to next_links
|
||||||
for url in valid_links:
|
for url in valid_links:
|
||||||
depths[url] = new_depth
|
depths[url] = new_depth
|
||||||
@@ -146,7 +141,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
"""
|
"""
|
||||||
queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
|
queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
|
||||||
# Push the initial URL with score 0 and depth 0.
|
# Push the initial URL with score 0 and depth 0.
|
||||||
await queue.put((0, 0, start_url, None))
|
initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
|
||||||
|
await queue.put((-initial_score, 0, start_url, None))
|
||||||
visited: Set[str] = set()
|
visited: Set[str] = set()
|
||||||
depths: Dict[str, int] = {start_url: 0}
|
depths: Dict[str, int] = {start_url: 0}
|
||||||
|
|
||||||
@@ -193,7 +189,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
result.metadata = result.metadata or {}
|
result.metadata = result.metadata or {}
|
||||||
result.metadata["depth"] = depth
|
result.metadata["depth"] = depth
|
||||||
result.metadata["parent_url"] = parent_url
|
result.metadata["parent_url"] = parent_url
|
||||||
result.metadata["score"] = score
|
result.metadata["score"] = -score
|
||||||
|
|
||||||
# Count only successful crawls toward max_pages limit
|
# Count only successful crawls toward max_pages limit
|
||||||
if result.success:
|
if result.success:
|
||||||
@@ -214,7 +210,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
for new_url, new_parent in new_links:
|
for new_url, new_parent in new_links:
|
||||||
new_depth = depths.get(new_url, depth + 1)
|
new_depth = depths.get(new_url, depth + 1)
|
||||||
new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
|
new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
|
||||||
await queue.put((new_score, new_depth, new_url, new_parent))
|
await queue.put((-new_score, new_depth, new_url, new_parent))
|
||||||
|
|
||||||
# End of crawl.
|
# End of crawl.
|
||||||
|
|
||||||
|
|||||||
@@ -1790,6 +1790,10 @@ def perform_completion_with_backoff(
|
|||||||
except RateLimitError as e:
|
except RateLimitError as e:
|
||||||
print("Rate limit error:", str(e))
|
print("Rate limit error:", str(e))
|
||||||
|
|
||||||
|
if attempt == max_attempts - 1:
|
||||||
|
# Last attempt failed, raise the error.
|
||||||
|
raise
|
||||||
|
|
||||||
# Check if we have exhausted our max attempts
|
# Check if we have exhausted our max attempts
|
||||||
if attempt < max_attempts - 1:
|
if attempt < max_attempts - 1:
|
||||||
# Calculate the delay and wait
|
# Calculate the delay and wait
|
||||||
@@ -2146,7 +2150,9 @@ def normalize_url(
|
|||||||
drop_query_tracking=True,
|
drop_query_tracking=True,
|
||||||
sort_query=True,
|
sort_query=True,
|
||||||
keep_fragment=False,
|
keep_fragment=False,
|
||||||
extra_drop_params=None
|
extra_drop_params=None,
|
||||||
|
preserve_https=False,
|
||||||
|
original_scheme=None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Extended URL normalizer
|
Extended URL normalizer
|
||||||
@@ -2171,17 +2177,36 @@ def normalize_url(
|
|||||||
str | None
|
str | None
|
||||||
A clean, canonical URL or None if href is empty/None.
|
A clean, canonical URL or None if href is empty/None.
|
||||||
"""
|
"""
|
||||||
if not href:
|
if not href or not href.strip():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Resolve relative paths first
|
# Resolve relative paths first
|
||||||
full_url = urljoin(base_url, href.strip())
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
# Parse once, edit parts, then rebuild
|
# Parse once, edit parts, then rebuild
|
||||||
parsed = urlparse(full_url)
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
# ── netloc ──
|
# ── netloc ──
|
||||||
netloc = parsed.netloc.lower()
|
netloc = parsed.netloc.lower()
|
||||||
|
|
||||||
|
# Remove default ports
|
||||||
|
if ':' in netloc:
|
||||||
|
host, port = netloc.rsplit(':', 1)
|
||||||
|
if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
|
||||||
|
netloc = host
|
||||||
|
else:
|
||||||
|
netloc = f"{host}:{port}"
|
||||||
|
|
||||||
# ── path ──
|
# ── path ──
|
||||||
# Strip duplicate slashes and trailing "/" (except root)
|
# Strip duplicate slashes and trailing "/" (except root)
|
||||||
@@ -2195,21 +2220,25 @@ def normalize_url(
|
|||||||
query = parsed.query
|
query = parsed.query
|
||||||
if query:
|
if query:
|
||||||
# explode, mutate, then rebuild
|
# explode, mutate, then rebuild
|
||||||
params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)]
|
params = list(parse_qsl(query, keep_blank_values=True)) # Parse query string into key-value pairs, preserving blank values
|
||||||
|
|
||||||
if drop_query_tracking:
|
if drop_query_tracking:
|
||||||
|
# Define default tracking parameters to remove for cleaner URLs
|
||||||
default_tracking = {
|
default_tracking = {
|
||||||
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term',
|
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term',
|
||||||
'utm_content', 'gclid', 'fbclid', 'ref', 'ref_src'
|
'utm_content', 'gclid', 'fbclid', 'ref', 'ref_src'
|
||||||
}
|
}
|
||||||
if extra_drop_params:
|
if extra_drop_params:
|
||||||
default_tracking |= {p.lower() for p in extra_drop_params}
|
default_tracking |= {p.lower() for p in extra_drop_params} # Add any extra parameters to drop, case-insensitive
|
||||||
params = [(k, v) for k, v in params if k not in default_tracking]
|
params = [(k, v) for k, v in params if k not in default_tracking] # Filter out tracking parameters
|
||||||
|
|
||||||
|
# Normalize parameter keys
|
||||||
|
params = [(k, v) for k, v in params]
|
||||||
|
|
||||||
if sort_query:
|
if sort_query:
|
||||||
params.sort(key=lambda kv: kv[0])
|
params.sort(key=lambda kv: kv[0]) # Sort parameters alphabetically by key (now lowercase)
|
||||||
|
|
||||||
query = urlencode(params, doseq=True) if params else ''
|
query = urlencode(params, doseq=True) if params else '' # Rebuild query string, handling sequences properly
|
||||||
|
|
||||||
# ── fragment ──
|
# ── fragment ──
|
||||||
fragment = parsed.fragment if keep_fragment else ''
|
fragment = parsed.fragment if keep_fragment else ''
|
||||||
@@ -2227,7 +2256,7 @@ def normalize_url(
|
|||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
def normalize_url_for_deep_crawl(href, base_url):
|
def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||||
"""Normalize URLs to ensure consistent format"""
|
"""Normalize URLs to ensure consistent format"""
|
||||||
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
|
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
|
||||||
|
|
||||||
@@ -2238,6 +2267,17 @@ def normalize_url_for_deep_crawl(href, base_url):
|
|||||||
# Use urljoin to handle relative URLs
|
# Use urljoin to handle relative URLs
|
||||||
full_url = urljoin(base_url, href.strip())
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
# Parse the URL for normalization
|
# Parse the URL for normalization
|
||||||
parsed = urlparse(full_url)
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
@@ -2275,7 +2315,7 @@ def normalize_url_for_deep_crawl(href, base_url):
|
|||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
@lru_cache(maxsize=10000)
|
@lru_cache(maxsize=10000)
|
||||||
def efficient_normalize_url_for_deep_crawl(href, base_url):
|
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||||
"""Efficient URL normalization with proper parsing"""
|
"""Efficient URL normalization with proper parsing"""
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
@@ -2285,6 +2325,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
|
|||||||
# Resolve relative URLs
|
# Resolve relative URLs
|
||||||
full_url = urljoin(base_url, href.strip())
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
# Use proper URL parsing
|
# Use proper URL parsing
|
||||||
parsed = urlparse(full_url)
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
|
|||||||
@@ -413,6 +413,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
|||||||
server_memory_mb = _get_memory_mb()
|
server_memory_mb = _get_memory_mb()
|
||||||
result_dict = result.model_dump()
|
result_dict = result.model_dump()
|
||||||
result_dict['server_memory_mb'] = server_memory_mb
|
result_dict['server_memory_mb'] = server_memory_mb
|
||||||
|
# Ensure fit_html is JSON-serializable
|
||||||
|
if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
|
||||||
|
result_dict["fit_html"] = None
|
||||||
# If PDF exists, encode it to base64
|
# If PDF exists, encode it to base64
|
||||||
if result_dict.get('pdf') is not None:
|
if result_dict.get('pdf') is not None:
|
||||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||||
@@ -493,6 +496,9 @@ async def handle_crawl_request(
|
|||||||
processed_results = []
|
processed_results = []
|
||||||
for result in results:
|
for result in results:
|
||||||
result_dict = result.model_dump()
|
result_dict = result.model_dump()
|
||||||
|
# if fit_html is not a string, set it to None to avoid serialization errors
|
||||||
|
if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
|
||||||
|
result_dict["fit_html"] = None
|
||||||
# If PDF exists, encode it to base64
|
# If PDF exists, encode it to base64
|
||||||
if result_dict.get('pdf') is not None:
|
if result_dict.get('pdf') is not None:
|
||||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||||
|
|||||||
@@ -7520,17 +7520,18 @@ class BrowserManager:
|
|||||||
)
|
)
|
||||||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||||||
|
|
||||||
if self.config.proxy or self.config.proxy_config:
|
if self.config.proxy:
|
||||||
|
warnings.warn(
|
||||||
|
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
|
||||||
|
DeprecationWarning,
|
||||||
|
)
|
||||||
|
if self.config.proxy_config:
|
||||||
from playwright.async_api import ProxySettings
|
from playwright.async_api import ProxySettings
|
||||||
|
|
||||||
proxy_settings = (
|
proxy_settings = ProxySettings(
|
||||||
ProxySettings(server=self.config.proxy)
|
server=self.config.proxy_config.server,
|
||||||
if self.config.proxy
|
username=self.config.proxy_config.username,
|
||||||
else ProxySettings(
|
password=self.config.proxy_config.password,
|
||||||
server=self.config.proxy_config.server,
|
|
||||||
username=self.config.proxy_config.username,
|
|
||||||
password=self.config.proxy_config.password,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
browser_args["proxy"] = proxy_settings
|
browser_args["proxy"] = proxy_settings
|
||||||
|
|
||||||
|
|||||||
@@ -2241,7 +2241,7 @@ docker build -t crawl4ai
|
|||||||
|
|
||||||
| Argument | Description | Default | Options |
|
| Argument | Description | Default | Options |
|
||||||
|----------|-------------|---------|----------|
|
|----------|-------------|---------|----------|
|
||||||
| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 |
|
| PYTHON_VERSION | Python version | 3.10 | 3.10, 3.11, 3.12, 3.13 |
|
||||||
| INSTALL_TYPE | Feature set | default | default, all, torch, transformer |
|
| INSTALL_TYPE | Feature set | default | default, all, torch, transformer |
|
||||||
| ENABLE_GPU | GPU support | false | true, false |
|
| ENABLE_GPU | GPU support | false | true, false |
|
||||||
| APP_HOME | Install path | /app | any valid path |
|
| APP_HOME | Install path | /app | any valid path |
|
||||||
|
|||||||
@@ -267,12 +267,26 @@ async def generate_html(
|
|||||||
Use when you need sanitized HTML structures for building schemas or further processing.
|
Use when you need sanitized HTML structures for building schemas or further processing.
|
||||||
"""
|
"""
|
||||||
cfg = CrawlerRunConfig()
|
cfg = CrawlerRunConfig()
|
||||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
try:
|
||||||
results = await crawler.arun(url=body.url, config=cfg)
|
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
||||||
raw_html = results[0].html
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
from crawl4ai.utils import preprocess_html_for_schema
|
# Check if the crawl was successful
|
||||||
processed_html = preprocess_html_for_schema(raw_html)
|
if not results[0].success:
|
||||||
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=results[0].error_message or "Crawl failed"
|
||||||
|
)
|
||||||
|
|
||||||
|
raw_html = results[0].html
|
||||||
|
from crawl4ai.utils import preprocess_html_for_schema
|
||||||
|
processed_html = preprocess_html_for_schema(raw_html)
|
||||||
|
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
|
||||||
|
except Exception as e:
|
||||||
|
# Log and raise as HTTP 500 for other exceptions
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=str(e)
|
||||||
|
)
|
||||||
|
|
||||||
# Screenshot endpoint
|
# Screenshot endpoint
|
||||||
|
|
||||||
@@ -290,18 +304,29 @@ async def generate_screenshot(
|
|||||||
Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot.
|
Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot.
|
||||||
Then in result instead of the screenshot you will get a path to the saved file.
|
Then in result instead of the screenshot you will get a path to the saved file.
|
||||||
"""
|
"""
|
||||||
cfg = CrawlerRunConfig(
|
try:
|
||||||
screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
|
cfg = CrawlerRunConfig(
|
||||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
|
||||||
results = await crawler.arun(url=body.url, config=cfg)
|
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
||||||
screenshot_data = results[0].screenshot
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
if body.output_path:
|
if not results[0].success:
|
||||||
abs_path = os.path.abspath(body.output_path)
|
raise HTTPException(
|
||||||
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
|
status_code=500,
|
||||||
with open(abs_path, "wb") as f:
|
detail=results[0].error_message or "Crawl failed"
|
||||||
f.write(base64.b64decode(screenshot_data))
|
)
|
||||||
return {"success": True, "path": abs_path}
|
screenshot_data = results[0].screenshot
|
||||||
return {"success": True, "screenshot": screenshot_data}
|
if body.output_path:
|
||||||
|
abs_path = os.path.abspath(body.output_path)
|
||||||
|
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
|
||||||
|
with open(abs_path, "wb") as f:
|
||||||
|
f.write(base64.b64decode(screenshot_data))
|
||||||
|
return {"success": True, "path": abs_path}
|
||||||
|
return {"success": True, "screenshot": screenshot_data}
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=str(e)
|
||||||
|
)
|
||||||
|
|
||||||
# PDF endpoint
|
# PDF endpoint
|
||||||
|
|
||||||
@@ -319,17 +344,28 @@ async def generate_pdf(
|
|||||||
Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
|
Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
|
||||||
Then in result instead of the PDF you will get a path to the saved file.
|
Then in result instead of the PDF you will get a path to the saved file.
|
||||||
"""
|
"""
|
||||||
cfg = CrawlerRunConfig(pdf=True)
|
try:
|
||||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
cfg = CrawlerRunConfig(pdf=True)
|
||||||
results = await crawler.arun(url=body.url, config=cfg)
|
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
||||||
pdf_data = results[0].pdf
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
if body.output_path:
|
if not results[0].success:
|
||||||
abs_path = os.path.abspath(body.output_path)
|
raise HTTPException(
|
||||||
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
|
status_code=500,
|
||||||
with open(abs_path, "wb") as f:
|
detail=results[0].error_message or "Crawl failed"
|
||||||
f.write(pdf_data)
|
)
|
||||||
return {"success": True, "path": abs_path}
|
pdf_data = results[0].pdf
|
||||||
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
|
if body.output_path:
|
||||||
|
abs_path = os.path.abspath(body.output_path)
|
||||||
|
os.makedirs(os.path.dirname(abs_path), exist_ok=True)
|
||||||
|
with open(abs_path, "wb") as f:
|
||||||
|
f.write(pdf_data)
|
||||||
|
return {"success": True, "path": abs_path}
|
||||||
|
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=str(e)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/execute_js")
|
@app.post("/execute_js")
|
||||||
@@ -385,12 +421,23 @@ async def execute_js(
|
|||||||
```
|
```
|
||||||
|
|
||||||
"""
|
"""
|
||||||
cfg = CrawlerRunConfig(js_code=body.scripts)
|
try:
|
||||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
cfg = CrawlerRunConfig(js_code=body.scripts)
|
||||||
results = await crawler.arun(url=body.url, config=cfg)
|
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
||||||
# Return JSON-serializable dict of the first CrawlResult
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
data = results[0].model_dump()
|
if not results[0].success:
|
||||||
return JSONResponse(data)
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=results[0].error_message or "Crawl failed"
|
||||||
|
)
|
||||||
|
# Return JSON-serializable dict of the first CrawlResult
|
||||||
|
data = results[0].model_dump()
|
||||||
|
return JSONResponse(data)
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail=str(e)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/llm/{url:path}")
|
@app.get("/llm/{url:path}")
|
||||||
@@ -435,16 +482,24 @@ async def crawl(
|
|||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Crawl a list of URLs and return the results as JSON.
|
Crawl a list of URLs and return the results as JSON.
|
||||||
|
For streaming responses, use /crawl/stream endpoint.
|
||||||
"""
|
"""
|
||||||
if not crawl_request.urls:
|
if not crawl_request.urls:
|
||||||
raise HTTPException(400, "At least one URL required")
|
raise HTTPException(400, "At least one URL required")
|
||||||
res = await handle_crawl_request(
|
# Check whether it is a redirection for a streaming request
|
||||||
|
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
|
||||||
|
if crawler_config.stream:
|
||||||
|
return await stream_process(crawl_request=crawl_request)
|
||||||
|
results = await handle_crawl_request(
|
||||||
urls=crawl_request.urls,
|
urls=crawl_request.urls,
|
||||||
browser_config=crawl_request.browser_config,
|
browser_config=crawl_request.browser_config,
|
||||||
crawler_config=crawl_request.crawler_config,
|
crawler_config=crawl_request.crawler_config,
|
||||||
config=config,
|
config=config,
|
||||||
)
|
)
|
||||||
return JSONResponse(res)
|
# check if all of the results are not successful
|
||||||
|
if all(not result["success"] for result in results["results"]):
|
||||||
|
raise HTTPException(500, f"Crawl request failed: {results['results'][0]['error_message']}")
|
||||||
|
return JSONResponse(results)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/crawl/stream")
|
@app.post("/crawl/stream")
|
||||||
@@ -456,12 +511,16 @@ async def crawl_stream(
|
|||||||
):
|
):
|
||||||
if not crawl_request.urls:
|
if not crawl_request.urls:
|
||||||
raise HTTPException(400, "At least one URL required")
|
raise HTTPException(400, "At least one URL required")
|
||||||
|
|
||||||
|
return await stream_process(crawl_request=crawl_request)
|
||||||
|
|
||||||
|
async def stream_process(crawl_request: CrawlRequest):
|
||||||
crawler, gen = await handle_stream_crawl_request(
|
crawler, gen = await handle_stream_crawl_request(
|
||||||
urls=crawl_request.urls,
|
urls=crawl_request.urls,
|
||||||
browser_config=crawl_request.browser_config,
|
browser_config=crawl_request.browser_config,
|
||||||
crawler_config=crawl_request.crawler_config,
|
crawler_config=crawl_request.crawler_config,
|
||||||
config=config,
|
config=config,
|
||||||
)
|
)
|
||||||
return StreamingResponse(
|
return StreamingResponse(
|
||||||
stream_results(crawler, gen),
|
stream_results(crawler, gen),
|
||||||
media_type="application/x-ndjson",
|
media_type="application/x-ndjson",
|
||||||
|
|||||||
@@ -371,7 +371,7 @@
|
|||||||
|
|
||||||
<div class="flex items-center">
|
<div class="flex items-center">
|
||||||
<input id="st-stream" type="checkbox" class="mr-2">
|
<input id="st-stream" type="checkbox" class="mr-2">
|
||||||
<label for="st-stream" class="text-sm">Use /crawl/stream</label>
|
<label for="st-stream" class="text-sm">Enable streaming mode</label>
|
||||||
<button id="st-run"
|
<button id="st-run"
|
||||||
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
|
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
|
||||||
Run Stress Test
|
Run Stress Test
|
||||||
@@ -596,6 +596,14 @@
|
|||||||
forceHighlightElement(curlCodeEl);
|
forceHighlightElement(curlCodeEl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Detect if stream is requested inside payload
|
||||||
|
function shouldUseStream(payload) {
|
||||||
|
const toBool = (v) => v === true || (typeof v === 'string' && v.toLowerCase() === 'true');
|
||||||
|
const fromCrawler = payload && payload.crawler_config && payload.crawler_config.params && payload.crawler_config.params.stream;
|
||||||
|
const direct = payload && payload.stream;
|
||||||
|
return toBool(fromCrawler) || toBool(direct);
|
||||||
|
}
|
||||||
|
|
||||||
// Main run function
|
// Main run function
|
||||||
async function runCrawl() {
|
async function runCrawl() {
|
||||||
const endpoint = document.getElementById('endpoint').value;
|
const endpoint = document.getElementById('endpoint').value;
|
||||||
@@ -611,16 +619,24 @@
|
|||||||
: { browser_config: cfgJson };
|
: { browser_config: cfgJson };
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
updateStatus('error');
|
const codeText = cm.getValue();
|
||||||
document.querySelector('#response-content code').textContent =
|
const streamFlag = /stream\s*=\s*True/i.test(codeText);
|
||||||
JSON.stringify({ error: err.message }, null, 2);
|
const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
|
||||||
forceHighlightElement(document.querySelector('#response-content code'));
|
if (isCrawlEndpoint && streamFlag) {
|
||||||
return; // stop run
|
// Fallback: proceed with minimal config only for stream
|
||||||
|
advConfig = { crawler_config: { stream: true } };
|
||||||
|
} else {
|
||||||
|
updateStatus('error');
|
||||||
|
document.querySelector('#response-content code').textContent =
|
||||||
|
JSON.stringify({ error: err.message }, null, 2);
|
||||||
|
forceHighlightElement(document.querySelector('#response-content code'));
|
||||||
|
return; // stop run
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const endpointMap = {
|
const endpointMap = {
|
||||||
crawl: '/crawl',
|
crawl: '/crawl',
|
||||||
// crawl_stream: '/crawl/stream',
|
crawl_stream: '/crawl/stream', // Keep for backward compatibility
|
||||||
md: '/md',
|
md: '/md',
|
||||||
llm: '/llm'
|
llm: '/llm'
|
||||||
};
|
};
|
||||||
@@ -647,7 +663,7 @@
|
|||||||
// This will be handled directly in the fetch below
|
// This will be handled directly in the fetch below
|
||||||
payload = null;
|
payload = null;
|
||||||
} else {
|
} else {
|
||||||
// Default payload for /crawl and /crawl/stream
|
// Default payload for /crawl (supports both streaming and batch modes)
|
||||||
payload = {
|
payload = {
|
||||||
urls,
|
urls,
|
||||||
...advConfig
|
...advConfig
|
||||||
@@ -659,6 +675,7 @@
|
|||||||
try {
|
try {
|
||||||
const startTime = performance.now();
|
const startTime = performance.now();
|
||||||
let response, responseData;
|
let response, responseData;
|
||||||
|
const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);
|
||||||
|
|
||||||
if (endpoint === 'llm') {
|
if (endpoint === 'llm') {
|
||||||
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
|
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
|
||||||
@@ -681,8 +698,8 @@
|
|||||||
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
||||||
document.querySelector('#response-content code').className = 'json hljs';
|
document.querySelector('#response-content code').className = 'json hljs';
|
||||||
forceHighlightElement(document.querySelector('#response-content code'));
|
forceHighlightElement(document.querySelector('#response-content code'));
|
||||||
} else if (endpoint === 'crawl_stream') {
|
} else if (endpoint === 'crawl_stream' || useStreamOverride) {
|
||||||
// Stream processing
|
// Stream processing - now handled directly by /crawl endpoint
|
||||||
response = await fetch(api, {
|
response = await fetch(api, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
@@ -757,6 +774,7 @@
|
|||||||
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
||||||
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
|
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
|
||||||
} else {
|
} else {
|
||||||
|
// Use the same API endpoint for both streaming and non-streaming
|
||||||
generateSnippets(api, payload);
|
generateSnippets(api, payload);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -786,7 +804,7 @@
|
|||||||
document.getElementById('stress-avg-time').textContent = '0';
|
document.getElementById('stress-avg-time').textContent = '0';
|
||||||
document.getElementById('stress-peak-mem').textContent = '0';
|
document.getElementById('stress-peak-mem').textContent = '0';
|
||||||
|
|
||||||
const api = useStream ? '/crawl/stream' : '/crawl';
|
const api = '/crawl'; // Always use /crawl - backend handles streaming internally
|
||||||
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
|
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
|
||||||
const chunks = [];
|
const chunks = [];
|
||||||
|
|
||||||
|
|||||||
154
docs/examples/adaptive_crawling/llm_config_example.py
Normal file
154
docs/examples/adaptive_crawling/llm_config_example.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
|
||||||
|
|
||||||
|
|
||||||
|
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
|
||||||
|
"""Test a specific configuration"""
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"Configuration: {name}")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||||
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
|
result = await adaptive.digest(start_url=url, query=query)
|
||||||
|
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("CRAWL STATISTICS")
|
||||||
|
print("="*50)
|
||||||
|
adaptive.print_stats(detailed=False)
|
||||||
|
|
||||||
|
# Get the most relevant content found
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("MOST RELEVANT PAGES")
|
||||||
|
print("="*50)
|
||||||
|
|
||||||
|
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||||
|
for i, page in enumerate(relevant_pages, 1):
|
||||||
|
print(f"\n{i}. {page['url']}")
|
||||||
|
print(f" Relevance Score: {page['score']:.2%}")
|
||||||
|
|
||||||
|
# Show a snippet of the content
|
||||||
|
content = page['content'] or ""
|
||||||
|
if content:
|
||||||
|
snippet = content[:200].replace('\n', ' ')
|
||||||
|
if len(content) > 200:
|
||||||
|
snippet += "..."
|
||||||
|
print(f" Preview: {snippet}")
|
||||||
|
|
||||||
|
print(f"\n{'='*50}")
|
||||||
|
print(f"Pages crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"Final confidence: {adaptive.confidence:.1%}")
|
||||||
|
print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")
|
||||||
|
|
||||||
|
if result.metrics.get('is_irrelevant', False):
|
||||||
|
print("⚠️ Query detected as irrelevant!")
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
async def llm_embedding():
|
||||||
|
"""Demonstrate various embedding configurations"""
|
||||||
|
|
||||||
|
print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Base URL and query for testing
|
||||||
|
test_url = "https://docs.python.org/3/library/asyncio.html"
|
||||||
|
|
||||||
|
openai_llm_config = LLMConfig(
|
||||||
|
provider='openai/text-embedding-3-small',
|
||||||
|
api_token=os.getenv('OPENAI_API_KEY'),
|
||||||
|
temperature=0.7,
|
||||||
|
max_tokens=2000
|
||||||
|
)
|
||||||
|
config_openai = AdaptiveConfig(
|
||||||
|
strategy="embedding",
|
||||||
|
max_pages=10,
|
||||||
|
|
||||||
|
# Use OpenAI embeddings
|
||||||
|
embedding_llm_config=openai_llm_config,
|
||||||
|
# embedding_llm_config={
|
||||||
|
# 'provider': 'openai/text-embedding-3-small',
|
||||||
|
# 'api_token': os.getenv('OPENAI_API_KEY')
|
||||||
|
# },
|
||||||
|
|
||||||
|
# OpenAI embeddings are high quality, can be stricter
|
||||||
|
embedding_k_exp=4.0,
|
||||||
|
n_query_variations=12
|
||||||
|
)
|
||||||
|
|
||||||
|
await test_configuration(
|
||||||
|
"OpenAI Embeddings",
|
||||||
|
config_openai,
|
||||||
|
test_url,
|
||||||
|
# "event-driven architecture patterns"
|
||||||
|
"async await context managers coroutines"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def basic_adaptive_crawling():
|
||||||
|
"""Basic adaptive crawling example"""
|
||||||
|
|
||||||
|
# Initialize the crawler
|
||||||
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
|
# Create an adaptive crawler with default settings (statistical strategy)
|
||||||
|
adaptive = AdaptiveCrawler(crawler)
|
||||||
|
|
||||||
|
# Note: You can also use embedding strategy for semantic understanding:
|
||||||
|
# from crawl4ai import AdaptiveConfig
|
||||||
|
# config = AdaptiveConfig(strategy="embedding")
|
||||||
|
# adaptive = AdaptiveCrawler(crawler, config)
|
||||||
|
|
||||||
|
# Start adaptive crawling
|
||||||
|
print("Starting adaptive crawl for Python async programming information...")
|
||||||
|
result = await adaptive.digest(
|
||||||
|
start_url="https://docs.python.org/3/library/asyncio.html",
|
||||||
|
query="async await context managers coroutines"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Display crawl statistics
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("CRAWL STATISTICS")
|
||||||
|
print("="*50)
|
||||||
|
adaptive.print_stats(detailed=False)
|
||||||
|
|
||||||
|
# Get the most relevant content found
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("MOST RELEVANT PAGES")
|
||||||
|
print("="*50)
|
||||||
|
|
||||||
|
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||||
|
for i, page in enumerate(relevant_pages, 1):
|
||||||
|
print(f"\n{i}. {page['url']}")
|
||||||
|
print(f" Relevance Score: {page['score']:.2%}")
|
||||||
|
|
||||||
|
# Show a snippet of the content
|
||||||
|
content = page['content'] or ""
|
||||||
|
if content:
|
||||||
|
snippet = content[:200].replace('\n', ' ')
|
||||||
|
if len(content) > 200:
|
||||||
|
snippet += "..."
|
||||||
|
print(f" Preview: {snippet}")
|
||||||
|
|
||||||
|
# Show final confidence
|
||||||
|
print(f"\n{'='*50}")
|
||||||
|
print(f"Final Confidence: {adaptive.confidence:.2%}")
|
||||||
|
print(f"Total Pages Crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")
|
||||||
|
|
||||||
|
|
||||||
|
if adaptive.confidence >= 0.8:
|
||||||
|
print("✓ High confidence - can answer detailed questions about async Python")
|
||||||
|
elif adaptive.confidence >= 0.6:
|
||||||
|
print("~ Moderate confidence - can answer basic questions")
|
||||||
|
else:
|
||||||
|
print("✗ Low confidence - need more information")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(llm_embedding())
|
||||||
|
# asyncio.run(basic_adaptive_crawling())
|
||||||
221
docs/examples/website-to-api/.gitignore
vendored
Normal file
221
docs/examples/website-to-api/.gitignore
vendored
Normal file
@@ -0,0 +1,221 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[codz]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py.cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# UV
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
#uv.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
#poetry.toml
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
||||||
|
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
||||||
|
#pdm.lock
|
||||||
|
#pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# pixi
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
||||||
|
#pixi.lock
|
||||||
|
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
||||||
|
# in the .venv directory. It is recommended not to include this directory in version control.
|
||||||
|
.pixi
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# Redis
|
||||||
|
*.rdb
|
||||||
|
*.aof
|
||||||
|
*.pid
|
||||||
|
|
||||||
|
# RabbitMQ
|
||||||
|
mnesia/
|
||||||
|
rabbitmq/
|
||||||
|
rabbitmq-data/
|
||||||
|
|
||||||
|
# ActiveMQ
|
||||||
|
activemq-data/
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.envrc
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
# Abstra
|
||||||
|
# Abstra is an AI-powered process automation framework.
|
||||||
|
# Ignore directories containing user credentials, local state, and settings.
|
||||||
|
# Learn more at https://abstra.io/docs
|
||||||
|
.abstra/
|
||||||
|
|
||||||
|
# Visual Studio Code
|
||||||
|
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
||||||
|
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
||||||
|
# you could uncomment the following to ignore the entire vscode folder
|
||||||
|
# .vscode/
|
||||||
|
|
||||||
|
# Ruff stuff:
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# PyPI configuration file
|
||||||
|
.pypirc
|
||||||
|
|
||||||
|
# Marimo
|
||||||
|
marimo/_static/
|
||||||
|
marimo/_lsp/
|
||||||
|
__marimo__/
|
||||||
|
|
||||||
|
# Streamlit
|
||||||
|
.streamlit/secrets.toml
|
||||||
|
|
||||||
|
#directories
|
||||||
|
models
|
||||||
|
schemas
|
||||||
|
saved_requests
|
||||||
252
docs/examples/website-to-api/README.md
Normal file
252
docs/examples/website-to-api/README.md
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
# Web Scraper API with Custom Model Support
|
||||||
|
|
||||||
|
A powerful web scraping API that converts any website into structured data using AI. Features a beautiful minimalist frontend interface and support for custom LLM models!
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **AI-Powered Scraping**: Provide a URL and plain English query to extract structured data
|
||||||
|
- **Beautiful Frontend**: Modern minimalist black-and-white interface with smooth UX
|
||||||
|
- **Custom Model Support**: Use any LLM provider (OpenAI, Gemini, Anthropic, etc.) with your own API keys
|
||||||
|
- **Model Management**: Save, list, and manage multiple model configurations via web interface
|
||||||
|
- **Dual Scraping Approaches**: Choose between Schema-based (faster) or LLM-based (more flexible) extraction
|
||||||
|
- **API Request History**: Automatic saving and display of all API requests with cURL commands
|
||||||
|
- **Schema Caching**: Intelligent caching of generated schemas for faster subsequent requests
|
||||||
|
- **Duplicate Prevention**: Avoids saving duplicate requests (same URL + query)
|
||||||
|
- **RESTful API**: Easy-to-use HTTP endpoints for all operations
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### 1. Install Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Start the API Server
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python app.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The server will start on `http://localhost:8000` with a beautiful web interface!
|
||||||
|
|
||||||
|
### 3. Using the Web Interface
|
||||||
|
|
||||||
|
Once the server is running, open your browser and go to `http://localhost:8000` to access the modern web interface!
|
||||||
|
|
||||||
|
#### Pages:
|
||||||
|
- **Scrape Data**: Enter URLs and queries to extract structured data
|
||||||
|
- **Models**: Manage your AI model configurations (add, list, delete)
|
||||||
|
- **API Requests**: View history of all scraping requests with cURL commands
|
||||||
|
|
||||||
|
#### Features:
|
||||||
|
- **Minimalist Design**: Clean black-and-white theme inspired by modern web apps
|
||||||
|
- **Real-time Results**: See extracted data in formatted JSON
|
||||||
|
- **Copy to Clipboard**: Easy copying of results
|
||||||
|
- **Toast Notifications**: User-friendly feedback
|
||||||
|
- **Dual Scraping Modes**: Choose between Schema-based and LLM-based approaches
|
||||||
|
|
||||||
|
## Model Management
|
||||||
|
|
||||||
|
### Adding Models via Web Interface
|
||||||
|
|
||||||
|
1. Go to the **Models** page
|
||||||
|
2. Enter your model details:
|
||||||
|
- **Provider**: LLM provider (e.g., `gemini/gemini-2.5-flash`, `openai/gpt-4o`)
|
||||||
|
- **API Token**: Your API key for the provider
|
||||||
|
3. Click "Add Model"
|
||||||
|
|
||||||
|
### API Usage for Model Management
|
||||||
|
|
||||||
|
#### Save a Model Configuration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST "http://localhost:8000/models" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"provider": "gemini/gemini-2.5-flash",
|
||||||
|
"api_token": "your-api-key-here"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
#### List Saved Models
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X GET "http://localhost:8000/models"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Delete a Model Configuration
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X DELETE "http://localhost:8000/models/my-gemini"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scraping Approaches
|
||||||
|
|
||||||
|
### 1. Schema-based Scraping (Faster)
|
||||||
|
- Generates CSS selectors for targeted extraction
|
||||||
|
- Caches schemas for repeated requests
|
||||||
|
- Faster execution for structured websites
|
||||||
|
|
||||||
|
### 2. LLM-based Scraping (More Flexible)
|
||||||
|
- Direct LLM extraction without schema generation
|
||||||
|
- More flexible for complex or dynamic content
|
||||||
|
- Better for unstructured data extraction
|
||||||
|
|
||||||
|
## Supported LLM Providers
|
||||||
|
|
||||||
|
The API supports any LLM provider that crawl4ai supports, including:
|
||||||
|
|
||||||
|
- **Google Gemini**: `gemini/gemini-2.5-flash`, `gemini/gemini-pro`
|
||||||
|
- **OpenAI**: `openai/gpt-4`, `openai/gpt-3.5-turbo`
|
||||||
|
- **Anthropic**: `anthropic/claude-3-opus`, `anthropic/claude-3-sonnet`
|
||||||
|
- **And more...**
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### Core Endpoints
|
||||||
|
|
||||||
|
- `POST /scrape` - Schema-based scraping
|
||||||
|
- `POST /scrape-with-llm` - LLM-based scraping
|
||||||
|
- `GET /schemas` - List cached schemas
|
||||||
|
- `POST /clear-cache` - Clear schema cache
|
||||||
|
- `GET /health` - Health check
|
||||||
|
|
||||||
|
### Model Management Endpoints
|
||||||
|
|
||||||
|
- `GET /models` - List saved model configurations
|
||||||
|
- `POST /models` - Save a new model configuration
|
||||||
|
- `DELETE /models/{model_name}` - Delete a model configuration
|
||||||
|
|
||||||
|
### API Request History
|
||||||
|
|
||||||
|
- `GET /saved-requests` - List all saved API requests
|
||||||
|
- `DELETE /saved-requests/{request_id}` - Delete a saved request
|
||||||
|
|
||||||
|
## Request/Response Examples
|
||||||
|
|
||||||
|
### Scrape Request
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"url": "https://example.com",
|
||||||
|
"query": "Extract the product name, price, and description",
|
||||||
|
"model_name": "my-custom-model"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Scrape Response
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"success": true,
|
||||||
|
"url": "https://example.com",
|
||||||
|
"query": "Extract the product name, price, and description",
|
||||||
|
"extracted_data": {
|
||||||
|
"product_name": "Example Product",
|
||||||
|
"price": "$99.99",
|
||||||
|
"description": "This is an example product description"
|
||||||
|
},
|
||||||
|
"schema_used": { ... },
|
||||||
|
"timestamp": "2024-01-01T12:00:00Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Model Configuration Request
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"provider": "gemini/gemini-2.5-flash",
|
||||||
|
"api_token": "your-api-key-here"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
Run the test script to verify the model management functionality:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python test_models.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
website-to-api/
|
||||||
|
├── api_server.py # FastAPI server with all endpoints
|
||||||
|
├── web_scraper_lib.py # Core scraping library
|
||||||
|
├── test_models.py # Test script for model management
|
||||||
|
├── requirements.txt # Dependencies
|
||||||
|
├── static/ # Frontend files
|
||||||
|
│ ├── index.html # Main HTML interface
|
||||||
|
│ ├── styles.css # CSS styles (minimalist theme)
|
||||||
|
│ └── script.js # JavaScript functionality
|
||||||
|
├── schemas/ # Cached schemas
|
||||||
|
├── models/ # Saved model configurations
|
||||||
|
├── saved_requests/ # API request history
|
||||||
|
└── README.md # This file
|
||||||
|
```
|
||||||
|
|
||||||
|
## Advanced Usage
|
||||||
|
|
||||||
|
### Using the Library Directly
|
||||||
|
|
||||||
|
```python
|
||||||
|
from web_scraper_lib import WebScraperAgent
|
||||||
|
|
||||||
|
# Initialize agent
|
||||||
|
agent = WebScraperAgent()
|
||||||
|
|
||||||
|
# Save a model configuration
|
||||||
|
agent.save_model_config(
|
||||||
|
model_name="my-model",
|
||||||
|
provider="openai/gpt-4",
|
||||||
|
api_token="your-api-key"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Schema-based scraping
|
||||||
|
result = await agent.scrape_data(
|
||||||
|
url="https://example.com",
|
||||||
|
query="Extract product information",
|
||||||
|
model_name="my-model"
|
||||||
|
)
|
||||||
|
|
||||||
|
# LLM-based scraping
|
||||||
|
result = await agent.scrape_data_with_llm(
|
||||||
|
url="https://example.com",
|
||||||
|
query="Extract product information",
|
||||||
|
model_name="my-model"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Schema Caching
|
||||||
|
|
||||||
|
The system automatically caches generated schemas based on URL and query combinations:
|
||||||
|
|
||||||
|
- **First request**: Generates schema using AI
|
||||||
|
- **Subsequent requests**: Uses cached schema for faster extraction
|
||||||
|
|
||||||
|
### API Request History
|
||||||
|
|
||||||
|
All API requests are automatically saved with:
|
||||||
|
- Request details (URL, query, model used)
|
||||||
|
- Response data
|
||||||
|
- Timestamp
|
||||||
|
- cURL command for re-execution
|
||||||
|
|
||||||
|
### Duplicate Prevention
|
||||||
|
|
||||||
|
The system prevents saving duplicate requests:
|
||||||
|
- Same URL + query combinations are not saved multiple times
|
||||||
|
- Returns existing request ID for duplicates
|
||||||
|
- Keeps the API request history clean
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
The API provides detailed error messages for common issues:
|
||||||
|
|
||||||
|
- Invalid URLs
|
||||||
|
- Missing model configurations
|
||||||
|
- API key errors
|
||||||
|
- Network timeouts
|
||||||
|
- Parsing errors
|
||||||
363
docs/examples/website-to-api/api_server.py
Normal file
363
docs/examples/website-to-api/api_server.py
Normal file
@@ -0,0 +1,363 @@
|
|||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from fastapi.staticfiles import StaticFiles
|
||||||
|
from fastapi.responses import FileResponse
|
||||||
|
from pydantic import BaseModel, HttpUrl
|
||||||
|
from typing import Dict, Any, Optional, Union, List
|
||||||
|
import uvicorn
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
from web_scraper_lib import WebScraperAgent, scrape_website
|
||||||
|
|
||||||
|
# FastAPI application instance; title/description/version surface in /docs.
app = FastAPI(
    title="Web Scraper API",
    description="Convert any website into a structured data API. Provide a URL and tell AI what data you need in plain English.",
    version="1.0.0"
)

# Mount static files (frontend HTML/CSS/JS) only when the directory exists,
# so the API can still start without the bundled UI.
if os.path.exists("static"):
    app.mount("/static", StaticFiles(directory="static"), name="static")

# Mount assets directory (logo image referenced by the frontend)
if os.path.exists("assets"):
    app.mount("/assets", StaticFiles(directory="assets"), name="assets")

# Initialize the scraper agent shared by every endpoint in this module.
scraper_agent = WebScraperAgent()

# Create directory for saved API requests (no-op if it already exists)
os.makedirs("saved_requests", exist_ok=True)
|
||||||
|
|
||||||
|
class ScrapeRequest(BaseModel):
    """Payload for the /scrape and /scrape-with-llm endpoints."""

    url: HttpUrl  # target page; pydantic validates it is a well-formed URL
    query: str  # plain-English description of the data to extract
    model_name: Optional[str] = None  # saved model config to use; None selects the agent default
|
||||||
|
|
||||||
|
class ModelConfigRequest(BaseModel):
    """Payload for POST /models: register an LLM provider configuration."""

    model_name: str  # key under which the configuration is saved
    provider: str  # provider/model identifier, e.g. "gemini/gemini-2.5-flash"
    api_token: str  # API key for the provider
|
||||||
|
|
||||||
|
class ScrapeResponse(BaseModel):
    """Response model shared by both scraping endpoints."""

    success: bool  # True on successful extraction
    url: str  # echoed request URL
    query: str  # echoed request query
    extracted_data: Union[Dict[str, Any], list]  # structured extraction result
    schema_used: Optional[Dict[str, Any]] = None  # populated by /scrape only (LLM mode has no schema)
    timestamp: Optional[str] = None  # ISO timestamp of the extraction
    error: Optional[str] = None  # unused on success paths; reserved for error reporting
|
||||||
|
|
||||||
|
class SavedApiRequest(BaseModel):
    """One persisted request/response record, stored as JSON by save_api_request."""

    id: str  # timestamp-derived identifier; also the JSON file's basename
    endpoint: str  # API path the request hit, e.g. "/scrape"
    method: str  # HTTP method
    headers: Dict[str, str]  # request headers that were recorded
    body: Dict[str, Any]  # request payload (url/query/model_name for scrape endpoints)
    timestamp: str  # ISO timestamp of when the record was saved
    response: Optional[Dict[str, Any]] = None  # response payload, or {"error": ...} on failure
|
||||||
|
|
||||||
|
def save_api_request(endpoint: str, method: str, headers: Dict[str, str], body: Dict[str, Any], response: Optional[Dict[str, Any]] = None) -> str:
    """Persist one API request (and optionally its response) as a JSON file.

    Args:
        endpoint: API path the request hit (e.g. "/scrape").
        method: HTTP method used.
        headers: Request headers to record.
        body: Request payload; for scrape endpoints it contains "url" and "query".
        response: Optional response payload, or {"error": ...} for failures.

    Returns:
        The id of the saved record. For a duplicate scrape request (same
        endpoint, URL, and query) the id of the existing record is returned
        and no new file is written.
    """

    # Check for duplicate requests (same URL and query) — only scrape
    # endpoints are deduplicated; model/cache endpoints are always saved.
    if endpoint in ["/scrape", "/scrape-with-llm"] and "url" in body and "query" in body:
        existing_requests = get_saved_requests()
        for existing_request in existing_requests:
            if (existing_request.endpoint == endpoint and
                existing_request.body.get("url") == body["url"] and
                existing_request.body.get("query") == body["query"]):
                print(f"Duplicate request found for URL: {body['url']} and query: {body['query']}")
                return existing_request.id  # Return existing request ID instead of creating new one

    # Timestamp-based id: microseconds truncated to milliseconds by [:-3].
    request_id = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]

    saved_request = SavedApiRequest(
        id=request_id,
        endpoint=endpoint,
        method=method,
        headers=headers,
        body=body,
        timestamp=datetime.now().isoformat(),
        response=response
    )

    # The file name doubles as the record id so lookups/deletes are direct.
    file_path = os.path.join("saved_requests", f"{request_id}.json")
    with open(file_path, "w") as f:
        json.dump(saved_request.dict(), f, indent=2)

    return request_id
|
||||||
|
|
||||||
|
def get_saved_requests() -> List[SavedApiRequest]:
    """Load all saved API requests from disk.

    Returns:
        List of SavedApiRequest records sorted by timestamp, newest first.
        Files that fail to load or validate are skipped with a console
        warning rather than aborting the whole listing.
    """
    requests = []
    if os.path.exists("saved_requests"):
        for filename in os.listdir("saved_requests"):
            if filename.endswith('.json'):
                file_path = os.path.join("saved_requests", filename)
                try:
                    with open(file_path, "r") as f:
                        data = json.load(f)
                        requests.append(SavedApiRequest(**data))
                except Exception as e:
                    # Fix: name the offending file instead of printing "(unknown)"
                    # so a corrupt record can actually be found and removed.
                    print(f"Error loading saved request {filename}: {e}")

    # Sort by timestamp (newest first)
    requests.sort(key=lambda x: x.timestamp, reverse=True)
    return requests
|
||||||
|
|
||||||
|
@app.get("/")
|
||||||
|
async def root():
|
||||||
|
"""Serve the frontend interface."""
|
||||||
|
if os.path.exists("static/index.html"):
|
||||||
|
return FileResponse("static/index.html")
|
||||||
|
else:
|
||||||
|
return {
|
||||||
|
"message": "Web Scraper API",
|
||||||
|
"description": "Convert any website into structured data with AI",
|
||||||
|
"endpoints": {
|
||||||
|
"/scrape": "POST - Scrape data from a website",
|
||||||
|
"/schemas": "GET - List cached schemas",
|
||||||
|
"/clear-cache": "POST - Clear schema cache",
|
||||||
|
"/models": "GET - List saved model configurations",
|
||||||
|
"/models": "POST - Save a new model configuration",
|
||||||
|
"/models/{model_name}": "DELETE - Delete a model configuration",
|
||||||
|
"/saved-requests": "GET - List saved API requests"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@app.post("/scrape", response_model=ScrapeResponse)
|
||||||
|
async def scrape_website_endpoint(request: ScrapeRequest):
|
||||||
|
"""
|
||||||
|
Scrape structured data from any website.
|
||||||
|
|
||||||
|
This endpoint:
|
||||||
|
1. Takes a URL and plain English query
|
||||||
|
2. Generates a custom scraper using AI
|
||||||
|
3. Returns structured data
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Save the API request
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
body = {
|
||||||
|
"url": str(request.url),
|
||||||
|
"query": request.query,
|
||||||
|
"model_name": request.model_name
|
||||||
|
}
|
||||||
|
|
||||||
|
result = await scraper_agent.scrape_data(
|
||||||
|
url=str(request.url),
|
||||||
|
query=request.query,
|
||||||
|
model_name=request.model_name
|
||||||
|
)
|
||||||
|
|
||||||
|
response_data = ScrapeResponse(
|
||||||
|
success=True,
|
||||||
|
url=result["url"],
|
||||||
|
query=result["query"],
|
||||||
|
extracted_data=result["extracted_data"],
|
||||||
|
schema_used=result["schema_used"],
|
||||||
|
timestamp=result["timestamp"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save the request with response
|
||||||
|
save_api_request(
|
||||||
|
endpoint="/scrape",
|
||||||
|
method="POST",
|
||||||
|
headers=headers,
|
||||||
|
body=body,
|
||||||
|
response=response_data.dict()
|
||||||
|
)
|
||||||
|
|
||||||
|
return response_data
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# Save the failed request
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
body = {
|
||||||
|
"url": str(request.url),
|
||||||
|
"query": request.query,
|
||||||
|
"model_name": request.model_name
|
||||||
|
}
|
||||||
|
|
||||||
|
save_api_request(
|
||||||
|
endpoint="/scrape",
|
||||||
|
method="POST",
|
||||||
|
headers=headers,
|
||||||
|
body=body,
|
||||||
|
response={"error": str(e)}
|
||||||
|
)
|
||||||
|
|
||||||
|
raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
|
||||||
|
|
||||||
|
@app.post("/scrape-with-llm", response_model=ScrapeResponse)
|
||||||
|
async def scrape_website_endpoint_with_llm(request: ScrapeRequest):
|
||||||
|
"""
|
||||||
|
Scrape structured data from any website using a custom LLM model.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Save the API request
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
body = {
|
||||||
|
"url": str(request.url),
|
||||||
|
"query": request.query,
|
||||||
|
"model_name": request.model_name
|
||||||
|
}
|
||||||
|
|
||||||
|
result = await scraper_agent.scrape_data_with_llm(
|
||||||
|
url=str(request.url),
|
||||||
|
query=request.query,
|
||||||
|
model_name=request.model_name
|
||||||
|
)
|
||||||
|
|
||||||
|
response_data = ScrapeResponse(
|
||||||
|
success=True,
|
||||||
|
url=result["url"],
|
||||||
|
query=result["query"],
|
||||||
|
extracted_data=result["extracted_data"],
|
||||||
|
timestamp=result["timestamp"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save the request with response
|
||||||
|
save_api_request(
|
||||||
|
endpoint="/scrape-with-llm",
|
||||||
|
method="POST",
|
||||||
|
headers=headers,
|
||||||
|
body=body,
|
||||||
|
response=response_data.dict()
|
||||||
|
)
|
||||||
|
|
||||||
|
return response_data
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# Save the failed request
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
body = {
|
||||||
|
"url": str(request.url),
|
||||||
|
"query": request.query,
|
||||||
|
"model_name": request.model_name
|
||||||
|
}
|
||||||
|
|
||||||
|
save_api_request(
|
||||||
|
endpoint="/scrape-with-llm",
|
||||||
|
method="POST",
|
||||||
|
headers=headers,
|
||||||
|
body=body,
|
||||||
|
response={"error": str(e)}
|
||||||
|
)
|
||||||
|
|
||||||
|
raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
|
||||||
|
|
||||||
|
@app.get("/saved-requests")
|
||||||
|
async def list_saved_requests():
|
||||||
|
"""List all saved API requests."""
|
||||||
|
try:
|
||||||
|
requests = get_saved_requests()
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"requests": [req.dict() for req in requests],
|
||||||
|
"count": len(requests)
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to list saved requests: {str(e)}")
|
||||||
|
|
||||||
|
@app.delete("/saved-requests/{request_id}")
|
||||||
|
async def delete_saved_request(request_id: str):
|
||||||
|
"""Delete a saved API request."""
|
||||||
|
try:
|
||||||
|
file_path = os.path.join("saved_requests", f"{request_id}.json")
|
||||||
|
if os.path.exists(file_path):
|
||||||
|
os.remove(file_path)
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"message": f"Saved request '{request_id}' deleted successfully"
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Saved request '{request_id}' not found")
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to delete saved request: {str(e)}")
|
||||||
|
|
||||||
|
@app.get("/schemas")
|
||||||
|
async def list_cached_schemas():
|
||||||
|
"""List all cached schemas."""
|
||||||
|
try:
|
||||||
|
schemas = await scraper_agent.get_cached_schemas()
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"cached_schemas": schemas,
|
||||||
|
"count": len(schemas)
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to list schemas: {str(e)}")
|
||||||
|
|
||||||
|
@app.post("/clear-cache")
|
||||||
|
async def clear_schema_cache():
|
||||||
|
"""Clear all cached schemas."""
|
||||||
|
try:
|
||||||
|
scraper_agent.clear_cache()
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"message": "Schema cache cleared successfully"
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to clear cache: {str(e)}")
|
||||||
|
|
||||||
|
@app.get("/models")
|
||||||
|
async def list_models():
|
||||||
|
"""List all saved model configurations."""
|
||||||
|
try:
|
||||||
|
models = scraper_agent.list_saved_models()
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"models": models,
|
||||||
|
"count": len(models)
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to list models: {str(e)}")
|
||||||
|
|
||||||
|
@app.post("/models")
|
||||||
|
async def save_model_config(request: ModelConfigRequest):
|
||||||
|
"""Save a new model configuration."""
|
||||||
|
try:
|
||||||
|
success = scraper_agent.save_model_config(
|
||||||
|
model_name=request.model_name,
|
||||||
|
provider=request.provider,
|
||||||
|
api_token=request.api_token
|
||||||
|
)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"message": f"Model configuration '{request.model_name}' saved successfully"
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=500, detail="Failed to save model configuration")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to save model: {str(e)}")
|
||||||
|
|
||||||
|
@app.delete("/models/{model_name}")
|
||||||
|
async def delete_model_config(model_name: str):
|
||||||
|
"""Delete a model configuration."""
|
||||||
|
try:
|
||||||
|
success = scraper_agent.delete_model_config(model_name)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"message": f"Model configuration '{model_name}' deleted successfully"
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Model configuration '{model_name}' not found")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to delete model: {str(e)}")
|
||||||
|
|
||||||
|
@app.get("/health")
|
||||||
|
async def health_check():
|
||||||
|
"""Health check endpoint."""
|
||||||
|
return {"status": "healthy", "service": "web-scraper-api"}
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
49
docs/examples/website-to-api/app.py
Normal file
49
docs/examples/website-to-api/app.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Startup script for the Web Scraper API with frontend interface.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import uvicorn
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def main():
    """Validate the bundled frontend files, then launch the API server.

    Exits with status 1 when the static directory or any required
    frontend file is missing; otherwise blocks in uvicorn.run.
    """
    # Check if static directory exists
    static_dir = Path("static")
    if not static_dir.exists():
        print("❌ Static directory not found!")
        print("Please make sure the 'static' directory exists with the frontend files.")
        sys.exit(1)

    # Check if required frontend files exist
    required_files = ["index.html", "styles.css", "script.js"]
    # Idiom: collect the missing names in one comprehension instead of a
    # manual append loop.
    missing_files = [file for file in required_files if not (static_dir / file).exists()]

    if missing_files:
        print(f"❌ Missing frontend files: {', '.join(missing_files)}")
        print("Please make sure all frontend files are present in the static directory.")
        sys.exit(1)

    print("🚀 Starting Web Scraper API with Frontend Interface")
    print("=" * 50)
    print("📁 Static files found and ready to serve")
    print("🌐 Frontend will be available at: http://localhost:8000")
    print("🔌 API endpoints available at: http://localhost:8000/docs")
    print("=" * 50)

    # Start the server (reload=True restarts on source changes; dev use)
    uvicorn.run(
        "api_server:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
        log_level="info"
    )
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
BIN
docs/examples/website-to-api/assets/crawl4ai_logo.jpg
Normal file
BIN
docs/examples/website-to-api/assets/crawl4ai_logo.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 5.8 KiB |
5
docs/examples/website-to-api/requirements.txt
Normal file
5
docs/examples/website-to-api/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
crawl4ai
|
||||||
|
fastapi
|
||||||
|
uvicorn
|
||||||
|
pydantic
|
||||||
|
litellm
|
||||||
201
docs/examples/website-to-api/static/index.html
Normal file
201
docs/examples/website-to-api/static/index.html
Normal file
@@ -0,0 +1,201 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Web2API Example</title>
|
||||||
|
<link rel="stylesheet" href="/static/styles.css">
|
||||||
|
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<!-- Header -->
|
||||||
|
<header class="header">
|
||||||
|
<div class="header-content">
|
||||||
|
<div class="logo">
|
||||||
|
<img src="/assets/crawl4ai_logo.jpg" alt="Crawl4AI Logo" class="logo-image">
|
||||||
|
<span>Web2API Example</span>
|
||||||
|
</div>
|
||||||
|
<nav class="nav-links">
|
||||||
|
<a href="#" class="nav-link active" data-page="scrape">Scrape</a>
|
||||||
|
<a href="#" class="nav-link" data-page="models">Models</a>
|
||||||
|
<a href="#" class="nav-link" data-page="requests">API Requests</a>
|
||||||
|
</nav>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<!-- Main Content -->
|
||||||
|
<main class="main-content">
|
||||||
|
<!-- Scrape Page -->
|
||||||
|
<div id="scrape-page" class="page active">
|
||||||
|
<div class="hero-section">
|
||||||
|
<h1 class="hero-title">Turn Any Website Into An API</h1>
|
||||||
|
<p class="hero-subtitle">This example shows how to turn any website into an API using Crawl4AI.</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Workflow Demonstration -->
|
||||||
|
<div class="workflow-demo">
|
||||||
|
<div class="workflow-step">
|
||||||
|
<h3 class="step-title">1. Your Request</h3>
|
||||||
|
<div class="request-box">
|
||||||
|
<div class="input-group">
|
||||||
|
<label>URL:</label>
|
||||||
|
<input type="url" id="url" name="url" placeholder="https://example-bookstore.com/new-releases" required>
|
||||||
|
</div>
|
||||||
|
<div class="input-group">
|
||||||
|
<label>QUERY:</label>
|
||||||
|
<textarea id="query" name="query" placeholder="Extract all the book titles, their authors, and the biography of the author" required></textarea>
|
||||||
|
</div>
|
||||||
|
<div class="form-options">
|
||||||
|
<div class="option-group">
|
||||||
|
<label for="scraping-approach">Approach:</label>
|
||||||
|
<select id="scraping-approach" name="scraping_approach">
|
||||||
|
<option value="llm">LLM-based (More Flexible)</option>
|
||||||
|
<option value="schema">Schema-based (Uses LLM once!)</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="option-group">
|
||||||
|
<label for="model-select">Model:</label>
|
||||||
|
<select id="model-select" name="model_name" required>
|
||||||
|
<option value="">Select a Model</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<button type="submit" id="extract-btn" class="extract-btn">
|
||||||
|
<i class="fas fa-magic"></i>
|
||||||
|
Extract Data
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="workflow-arrow">→</div>
|
||||||
|
|
||||||
|
<div class="workflow-step">
|
||||||
|
<h3 class="step-title">2. Your Instant API & Data</h3>
|
||||||
|
<div class="response-container">
|
||||||
|
<div class="api-request-box">
|
||||||
|
<label>API Request (cURL):</label>
|
||||||
|
<pre id="curl-example">curl -X POST http://localhost:8000/scrape -H "Content-Type: application/json" -d '{"url": "...", "query": "..."}'
|
||||||
|
|
||||||
|
# Or for LLM-based approach:
|
||||||
|
curl -X POST http://localhost:8000/scrape-with-llm -H "Content-Type: application/json" -d '{"url": "...", "query": "..."}'</pre>
|
||||||
|
</div>
|
||||||
|
<div class="json-response-box">
|
||||||
|
<label>JSON Response:</label>
|
||||||
|
<pre id="json-output">{
|
||||||
|
"success": true,
|
||||||
|
"extracted_data": [
|
||||||
|
{
|
||||||
|
"title": "Example Book",
|
||||||
|
"author": "John Doe",
|
||||||
|
"description": "A great book..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}</pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Results Section -->
|
||||||
|
<div id="results-section" class="results-section" style="display: none;">
|
||||||
|
<div class="results-header">
|
||||||
|
<h2>Extracted Data</h2>
|
||||||
|
<button id="copy-json" class="copy-btn">
|
||||||
|
<i class="fas fa-copy"></i>
|
||||||
|
Copy JSON
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="results-content">
|
||||||
|
<div class="result-info">
|
||||||
|
<div class="info-item">
|
||||||
|
<span class="label">URL:</span>
|
||||||
|
<span id="result-url" class="value"></span>
|
||||||
|
</div>
|
||||||
|
<div class="info-item">
|
||||||
|
<span class="label">Query:</span>
|
||||||
|
<span id="result-query" class="value"></span>
|
||||||
|
</div>
|
||||||
|
<div class="info-item">
|
||||||
|
<span class="label">Model Used:</span>
|
||||||
|
<span id="result-model" class="value"></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="json-display">
|
||||||
|
<pre id="actual-json-output"></pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Loading State -->
|
||||||
|
<div id="loading" class="loading" style="display: none;">
|
||||||
|
<div class="spinner"></div>
|
||||||
|
<p>AI is analyzing the website and extracting data...</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Models Page -->
|
||||||
|
<div id="models-page" class="page">
|
||||||
|
<div class="models-header">
|
||||||
|
<h1>Model Configuration</h1>
|
||||||
|
<p>Configure and manage your AI model configurations</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="models-container">
|
||||||
|
<!-- Add New Model Form -->
|
||||||
|
<div class="model-form-section">
|
||||||
|
<h3>Add New Model</h3>
|
||||||
|
<form id="model-form" class="model-form">
|
||||||
|
<div class="form-row">
|
||||||
|
<div class="input-group">
|
||||||
|
<label for="model-name">Model Name:</label>
|
||||||
|
<input type="text" id="model-name" name="model_name" placeholder="my-gemini" required>
|
||||||
|
</div>
|
||||||
|
<div class="input-group">
|
||||||
|
<label for="provider">Provider:</label>
|
||||||
|
<input type="text" id="provider" name="provider" placeholder="gemini/gemini-2.5-flash" required>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="input-group">
|
||||||
|
<label for="api-token">API Token:</label>
|
||||||
|
<input type="password" id="api-token" name="api_token" placeholder="Enter your API token" required>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<button type="submit" class="save-btn">
|
||||||
|
<i class="fas fa-save"></i>
|
||||||
|
Save Model
|
||||||
|
</button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Saved Models List -->
|
||||||
|
<div class="saved-models-section">
|
||||||
|
<h3>Saved Models</h3>
|
||||||
|
<div id="models-list" class="models-list">
|
||||||
|
<!-- Models will be loaded here -->
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- API Requests Page -->
|
||||||
|
<div id="requests-page" class="page">
|
||||||
|
<div class="requests-header">
|
||||||
|
<h1>Saved API Requests</h1>
|
||||||
|
<p>View and manage your previous API requests</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="requests-container">
|
||||||
|
<div class="requests-list" id="requests-list">
|
||||||
|
<!-- Saved requests will be loaded here -->
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
|
||||||
|
<!-- Toast Notifications -->
|
||||||
|
<div id="toast-container" class="toast-container"></div>
|
||||||
|
|
||||||
|
<script src="/static/script.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
401
docs/examples/website-to-api/static/script.js
Normal file
401
docs/examples/website-to-api/static/script.js
Normal file
@@ -0,0 +1,401 @@
|
|||||||
|
// API Configuration: base URL every backend call is built from.
const API_BASE_URL = 'http://localhost:8000';

// Cached references to the DOM nodes the app reads and updates.
const navLinks = document.querySelectorAll('.nav-link');
const pages = document.querySelectorAll('.page');
const scrapeForm = document.getElementById('scrape-form');
const modelForm = document.getElementById('model-form');
const modelSelect = document.getElementById('model-select');
const modelsList = document.getElementById('models-list');
const resultsSection = document.getElementById('results-section');
const loadingSection = document.getElementById('loading');
const copyJsonBtn = document.getElementById('copy-json');
|
||||||
|
|
||||||
|
// Navigation
|
||||||
|
// Top navigation: switch between the Scrape / Models / Requests pages.
navLinks.forEach(link => {
    link.addEventListener('click', (e) => {
        e.preventDefault();
        const target = link.dataset.page;

        // Highlight only the clicked link.
        navLinks.forEach(other => other.classList.remove('active'));
        link.classList.add('active');

        // Reveal the matching page, hide the rest.
        pages.forEach(page => page.classList.remove('active'));
        document.getElementById(`${target}-page`).classList.add('active');

        // Refresh page-specific data on entry.
        if (target === 'models') {
            loadModels();
        } else if (target === 'requests') {
            loadSavedRequests();
        }
    });
});
|
||||||
|
|
||||||
|
// Scrape Form Handler
|
||||||
|
// "Extract Data" button: validate the form, call the backend, render results.
document.getElementById('extract-btn').addEventListener('click', async (e) => {
    e.preventDefault();

    // Jump to the results area right away so the user sees progress.
    document.getElementById('results-section').scrollIntoView({
        behavior: 'smooth',
        block: 'start'
    });

    const targetUrl = document.getElementById('url').value;
    const userQuery = document.getElementById('query').value;
    const chosenModel = document.getElementById('model-select').value || null;
    const approach = document.getElementById('scraping-approach').value;

    // Guard clauses: both text fields and a model are required.
    if (!targetUrl || !userQuery) {
        showToast('Please fill in both URL and query fields', 'error');
        return;
    }
    if (!chosenModel) {
        showToast('Please select a model from the dropdown or add one from the Models page', 'error');
        return;
    }

    const payload = {
        url: targetUrl,
        query: userQuery,
        headless: true,           // browser always runs headless in this demo
        model_name: chosenModel
    };

    showLoading(true);
    hideResults();

    try {
        // LLM approach has its own endpoint; schema-based uses /scrape.
        const endpoint = approach === 'llm' ? '/scrape-with-llm' : '/scrape';
        const response = await fetch(`${API_BASE_URL}${endpoint}`, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify(payload)
        });
        const result = await response.json();

        if (!response.ok) {
            throw new Error(result.detail || 'Failed to extract data');
        }
        displayResults(result);
        showToast(`Data extracted successfully using ${approach === 'llm' ? 'LLM-based' : 'Schema-based'} approach!`, 'success');
    } catch (error) {
        console.error('Scraping error:', error);
        showToast(`Error: ${error.message}`, 'error');
    } finally {
        showLoading(false);
    }
});
|
||||||
|
|
||||||
|
// Model Form Handler
|
||||||
|
// "Add New Model" form: persist a model configuration on the backend.
modelForm.addEventListener('submit', async (e) => {
    e.preventDefault();

    const fields = new FormData(modelForm);
    const payload = {
        model_name: fields.get('model_name'),
        provider: fields.get('provider'),
        api_token: fields.get('api_token')
    };

    try {
        const response = await fetch(`${API_BASE_URL}/models`, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify(payload)
        });
        const result = await response.json();

        if (!response.ok) {
            throw new Error(result.detail || 'Failed to save model');
        }
        showToast('Model saved successfully!', 'success');
        modelForm.reset();
        loadModels();        // refresh the Models page list
        loadModelSelect();   // refresh the Scrape page dropdown
    } catch (error) {
        console.error('Model save error:', error);
        showToast(`Error: ${error.message}`, 'error');
    }
});
|
||||||
|
|
||||||
|
// Copy JSON Button
|
||||||
|
// Copy the extracted JSON to the clipboard, reporting success or failure.
copyJsonBtn.addEventListener('click', () => {
    const jsonText = document.getElementById('actual-json-output').textContent;

    navigator.clipboard.writeText(jsonText)
        .then(() => showToast('JSON copied to clipboard!', 'success'))
        .catch(() => showToast('Failed to copy JSON', 'error'));
});
|
||||||
|
|
||||||
|
// Load Models
|
||||||
|
// Fetch the saved model list from the API and render it on the Models page.
async function loadModels() {
    try {
        const response = await fetch(`${API_BASE_URL}/models`);
        const result = await response.json();

        if (!response.ok) {
            throw new Error(result.detail || 'Failed to load models');
        }
        displayModels(result.models);
    } catch (error) {
        console.error('Load models error:', error);
        showToast(`Error: ${error.message}`, 'error');
    }
}
|
||||||
|
|
||||||
|
// Display Models
|
||||||
|
// Render the saved-model cards into the Models page.
//
// Uses DOM construction instead of an innerHTML template string: model names
// are user-entered, and interpolating them raw into markup (and into an
// inline onclick('<name>') attribute, as the previous version did) lets a
// name containing quotes or HTML break the page or inject script.
function displayModels(models) {
    if (models.length === 0) {
        modelsList.innerHTML = '<p style="text-align: center; color: #7f8c8d; padding: 2rem;">No models saved yet. Add your first model above!</p>';
        return;
    }

    modelsList.innerHTML = '';
    models.forEach(model => {
        const card = document.createElement('div');
        card.className = 'model-card';

        const info = document.createElement('div');
        info.className = 'model-info';
        const name = document.createElement('div');
        name.className = 'model-name';
        name.textContent = model;  // textContent is never parsed as HTML
        const provider = document.createElement('div');
        provider.className = 'model-provider';
        provider.textContent = 'Model Configuration';
        info.append(name, provider);

        const actions = document.createElement('div');
        actions.className = 'model-actions';
        const delBtn = document.createElement('button');
        delBtn.className = 'btn btn-danger';
        delBtn.innerHTML = '<i class="fas fa-trash"></i> Delete';  // static markup only
        // Real listener instead of an inline onclick string, so the model
        // name is passed verbatim with no quote-escaping issues.
        delBtn.addEventListener('click', () => deleteModel(model));
        actions.appendChild(delBtn);

        card.append(info, actions);
        modelsList.appendChild(card);
    });
}
|
||||||
|
|
||||||
|
// Delete Model
|
||||||
|
// Ask for confirmation, then DELETE the model on the backend and refresh
// both the Models list and the Scrape page dropdown.
async function deleteModel(modelName) {
    if (!confirm(`Are you sure you want to delete the model "${modelName}"?`)) {
        return;
    }

    try {
        // encodeURIComponent: names containing '/', spaces, '?' or '#' would
        // otherwise corrupt the request path (the original interpolated raw).
        const response = await fetch(`${API_BASE_URL}/models/${encodeURIComponent(modelName)}`, {
            method: 'DELETE'
        });
        const result = await response.json();

        if (!response.ok) {
            throw new Error(result.detail || 'Failed to delete model');
        }
        showToast('Model deleted successfully!', 'success');
        loadModels();
        loadModelSelect();
    } catch (error) {
        console.error('Delete model error:', error);
        showToast(`Error: ${error.message}`, 'error');
    }
}
|
||||||
|
|
||||||
|
// Load Model Select Options
|
||||||
|
// Repopulate the Scrape page model dropdown from the backend.
async function loadModelSelect() {
    try {
        const response = await fetch(`${API_BASE_URL}/models`);
        const result = await response.json();

        // Original behavior: HTTP errors are silently ignored here.
        if (!response.ok) {
            return;
        }

        // Reset to the placeholder, then append one <option> per model.
        modelSelect.innerHTML = '<option value="">Select a Model</option>';
        for (const model of result.models) {
            const option = document.createElement('option');
            option.value = model;
            option.textContent = model;
            modelSelect.appendChild(option);
        }
    } catch (error) {
        console.error('Load model select error:', error);
    }
}
|
||||||
|
|
||||||
|
// Display Results
|
||||||
|
// Populate and reveal the results panel with the API response.
function displayResults(result) {
    // Summary: what was scraped, with which query and model.
    document.getElementById('result-url').textContent = result.url;
    document.getElementById('result-query').textContent = result.query;
    document.getElementById('result-model').textContent = result.model_name || 'Default Model';

    // Pretty-print the payload (textContent renders it verbatim).
    document.getElementById('actual-json-output').textContent =
        JSON.stringify(result.extracted_data, null, 2);

    // Rebuild the cURL snippet to mirror the request just made; the sample
    // JSON in the workflow demo is deliberately left as an example.
    const approach = document.getElementById('scraping-approach').value;
    const endpoint = approach === 'llm' ? '/scrape-with-llm' : '/scrape';
    document.getElementById('curl-example').textContent =
        `curl -X POST http://localhost:8000${endpoint} -H "Content-Type: application/json" -d '{"url": "${result.url}", "query": "${result.query}"}'`;

    resultsSection.style.display = 'block';
    resultsSection.scrollIntoView({ behavior: 'smooth' });
}
|
||||||
|
|
||||||
|
// Show/Hide Loading
|
||||||
|
// Toggle the spinner section on (true) or off (false).
function showLoading(show) {
    if (show) {
        loadingSection.style.display = 'block';
    } else {
        loadingSection.style.display = 'none';
    }
}
|
||||||
|
|
||||||
|
// Hide Results
|
||||||
|
// Collapse the results panel (called before a new extraction starts).
function hideResults() {
    resultsSection.style.setProperty('display', 'none');
}
|
||||||
|
|
||||||
|
// Toast Notifications
|
||||||
|
// Show a transient toast notification; `type` is 'success', 'error' or 'info'.
//
// The message is inserted via textContent rather than an innerHTML template:
// callers pass server-supplied error.message text here, which must never be
// parsed as HTML.
function showToast(message, type = 'info') {
    const toastContainer = document.getElementById('toast-container');
    const toast = document.createElement('div');
    toast.className = `toast ${type}`;

    const icon = type === 'success' ? 'fas fa-check-circle' :
                 type === 'error' ? 'fas fa-exclamation-circle' :
                 'fas fa-info-circle';

    const iconEl = document.createElement('i');
    iconEl.className = icon;
    const textEl = document.createElement('span');
    textEl.textContent = message;  // safe: never interpreted as markup
    toast.append(iconEl, textEl);

    toastContainer.appendChild(toast);

    // Self-dismiss after 5 seconds.
    setTimeout(() => {
        toast.remove();
    }, 5000);
}
|
||||||
|
|
||||||
|
// Load Saved Requests
|
||||||
|
// Fetch previously issued API requests and render them on the Requests page.
async function loadSavedRequests() {
    try {
        const response = await fetch(`${API_BASE_URL}/saved-requests`);
        const result = await response.json();

        if (!response.ok) {
            throw new Error(result.detail || 'Failed to load saved requests');
        }
        displaySavedRequests(result.requests);
    } catch (error) {
        console.error('Load saved requests error:', error);
        showToast(`Error: ${error.message}`, 'error');
    }
}
|
||||||
|
|
||||||
|
// Display Saved Requests
|
||||||
|
// Render the saved-request cards, each with a copyable cURL command.
//
// url, query, model and id come from user input / the backend; the previous
// version interpolated them raw into innerHTML, so a value containing '<' or
// quotes could break the markup or inject script. They are now HTML-escaped
// before interpolation.
function displaySavedRequests(requests) {
    const requestsList = document.getElementById('requests-list');

    if (requests.length === 0) {
        requestsList.innerHTML = '<p style="text-align: center; color: #CCCCCC; padding: 2rem;">No saved API requests yet. Make your first request from the Scrape page!</p>';
        return;
    }

    // Minimal HTML escaper for text placed inside markup.
    const esc = (s) => String(s)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    requestsList.innerHTML = requests.map(request => {
        const url = request.body.url;
        const query = request.body.query;
        const model = request.body.model_name || 'Default Model';
        const endpoint = request.endpoint;

        // Reconstruct the equivalent cURL invocation for this request.
        const curlCommand = `curl -X POST http://localhost:8000${endpoint} \\
  -H "Content-Type: application/json" \\
  -d '{
    "url": "${url}",
    "query": "${query}",
    "model_name": "${model}"
  }'`;

        return `
            <div class="request-card">
                <div class="request-header">
                    <div class="request-info">
                        <div class="request-url">${esc(url)}</div>
                        <div class="request-query">${esc(query)}</div>
                    </div>
                    <div class="request-actions">
                        <button class="btn-danger" onclick="deleteSavedRequest('${esc(request.id)}')">
                            <i class="fas fa-trash"></i>
                            Delete
                        </button>
                    </div>
                </div>

                <div class="request-curl">
                    <h4>cURL Command:</h4>
                    <pre>${esc(curlCommand)}</pre>
                </div>
            </div>
        `;
    }).join('');
}
|
||||||
|
|
||||||
|
// Delete Saved Request
|
||||||
|
// Confirm, then DELETE a saved request record and refresh the list.
async function deleteSavedRequest(requestId) {
    if (!confirm('Are you sure you want to delete this saved request?')) {
        return;
    }

    try {
        // encodeURIComponent keeps unusual ids from corrupting the path
        // (the original interpolated the raw id into the URL).
        const response = await fetch(`${API_BASE_URL}/saved-requests/${encodeURIComponent(requestId)}`, {
            method: 'DELETE'
        });
        const result = await response.json();

        if (!response.ok) {
            throw new Error(result.detail || 'Failed to delete saved request');
        }
        showToast('Saved request deleted successfully!', 'success');
        loadSavedRequests();
    } catch (error) {
        console.error('Delete saved request error:', error);
        showToast(`Error: ${error.message}`, 'error');
    }
}
|
||||||
|
|
||||||
|
// Initialize
|
||||||
|
// Bootstrap: populate the model dropdown and verify the API is reachable.
document.addEventListener('DOMContentLoaded', () => {
    loadModelSelect();

    fetch(`${API_BASE_URL}/health`)
        .then(response => {
            if (!response.ok) {
                showToast('Warning: API server might not be running', 'error');
            }
        })
        .catch(() => {
            showToast('Warning: Cannot connect to API server. Make sure it\'s running on localhost:8000', 'error');
        });
});
|
||||||
765
docs/examples/website-to-api/static/styles.css
Normal file
765
docs/examples/website-to-api/static/styles.css
Normal file
@@ -0,0 +1,765 @@
|
|||||||
|
/* Reset and Base Styles */
|
||||||
|
* {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
box-sizing: border-box;
|
||||||
|
}
|
||||||
|
|
||||||
|
body {
|
||||||
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||||
|
background: #000000;
|
||||||
|
color: #FFFFFF;
|
||||||
|
line-height: 1.6;
|
||||||
|
font-size: 16px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Header */
|
||||||
|
.header {
|
||||||
|
border-bottom: 1px solid #333;
|
||||||
|
padding: 1rem 0;
|
||||||
|
background: #000000;
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 100;
|
||||||
|
}
|
||||||
|
|
||||||
|
.header-content {
|
||||||
|
max-width: 1200px;
|
||||||
|
margin: 0 auto;
|
||||||
|
padding: 0 2rem;
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.logo {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.5rem;
|
||||||
|
font-size: 1.5rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
.logo-image {
|
||||||
|
width: 40px;
|
||||||
|
height: 40px;
|
||||||
|
border-radius: 4px;
|
||||||
|
object-fit: contain;
|
||||||
|
}
|
||||||
|
|
||||||
|
.nav-links {
|
||||||
|
display: flex;
|
||||||
|
gap: 2rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.nav-link {
|
||||||
|
color: #CCCCCC;
|
||||||
|
text-decoration: none;
|
||||||
|
font-weight: 500;
|
||||||
|
transition: color 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.nav-link:hover,
|
||||||
|
.nav-link.active {
|
||||||
|
color: #FFFFFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Main Content */
|
||||||
|
.main-content {
|
||||||
|
max-width: 1200px;
|
||||||
|
margin: 0 auto;
|
||||||
|
padding: 2rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.page {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.page.active {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Hero Section */
|
||||||
|
.hero-section {
|
||||||
|
text-align: center;
|
||||||
|
margin-bottom: 4rem;
|
||||||
|
padding: 2rem 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.hero-title {
|
||||||
|
font-size: 3rem;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #FFFFFF;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
line-height: 1.2;
|
||||||
|
}
|
||||||
|
|
||||||
|
.hero-subtitle {
|
||||||
|
font-size: 1.25rem;
|
||||||
|
color: #CCCCCC;
|
||||||
|
max-width: 600px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Workflow Demo */
|
||||||
|
.workflow-demo {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 1fr auto 1fr;
|
||||||
|
gap: 2rem;
|
||||||
|
align-items: start;
|
||||||
|
margin-bottom: 4rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.workflow-step {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.step-title {
|
||||||
|
font-size: 1.25rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
text-align: center;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.workflow-arrow {
|
||||||
|
font-size: 2rem;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #09b5a5;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
margin-top: 20rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Request Box */
|
||||||
|
.request-box {
|
||||||
|
border: 2px solid #333;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 2rem;
|
||||||
|
background: #111111;
|
||||||
|
}
|
||||||
|
|
||||||
|
.input-group {
|
||||||
|
margin-bottom: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.input-group label {
|
||||||
|
display: block;
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
margin-bottom: 0.5rem;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.input-group input,
|
||||||
|
.input-group textarea,
|
||||||
|
.input-group select {
|
||||||
|
width: 100%;
|
||||||
|
padding: 0.75rem;
|
||||||
|
border: 1px solid #333;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
background: #1A1A1A;
|
||||||
|
color: #FFFFFF;
|
||||||
|
transition: border-color 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.input-group input:focus,
|
||||||
|
.input-group textarea:focus,
|
||||||
|
.input-group select:focus {
|
||||||
|
outline: none;
|
||||||
|
border-color: #09b5a5;
|
||||||
|
}
|
||||||
|
|
||||||
|
.input-group textarea {
|
||||||
|
min-height: 80px;
|
||||||
|
resize: vertical;
|
||||||
|
}
|
||||||
|
|
||||||
|
.form-options {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 1fr 1fr;
|
||||||
|
gap: 1rem;
|
||||||
|
margin-bottom: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.option-group {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.option-group label {
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.option-group input[type="checkbox"] {
|
||||||
|
width: auto;
|
||||||
|
margin-right: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.extract-btn {
|
||||||
|
width: 100%;
|
||||||
|
padding: 1rem;
|
||||||
|
background: #09b5a5;
|
||||||
|
color: #000000;
|
||||||
|
border: none;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 1rem;
|
||||||
|
font-weight: 600;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: background-color 0.2s ease;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
gap: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.extract-btn:hover {
|
||||||
|
background: #09b5a5;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Dropdown specific styling */
|
||||||
|
select,
|
||||||
|
.input-group select,
|
||||||
|
.option-group select {
|
||||||
|
cursor: pointer !important;
|
||||||
|
appearance: none !important;
|
||||||
|
-webkit-appearance: none !important;
|
||||||
|
-moz-appearance: none !important;
|
||||||
|
-ms-appearance: none !important;
|
||||||
|
background-image: url("data:image/svg+xml;charset=UTF-8,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='none' stroke='%23FFFFFF' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3e%3cpolyline points='6,9 12,15 18,9'%3e%3c/polyline%3e%3c/svg%3e") !important;
|
||||||
|
background-repeat: no-repeat !important;
|
||||||
|
background-position: right 0.75rem center !important;
|
||||||
|
background-size: 1rem !important;
|
||||||
|
padding-right: 2.5rem !important;
|
||||||
|
border: 1px solid #333 !important;
|
||||||
|
border-radius: 4px !important;
|
||||||
|
font-family: 'Courier New', monospace !important;
|
||||||
|
font-size: 0.9rem !important;
|
||||||
|
background-color: #1A1A1A !important;
|
||||||
|
color: #FFFFFF !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
select:hover,
|
||||||
|
.input-group select:hover,
|
||||||
|
.option-group select:hover {
|
||||||
|
border-color: #09b5a5 !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
select:focus,
|
||||||
|
.input-group select:focus,
|
||||||
|
.option-group select:focus {
|
||||||
|
outline: none !important;
|
||||||
|
border-color: #09b5a5 !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
select option,
|
||||||
|
.input-group select option,
|
||||||
|
.option-group select option {
|
||||||
|
background: #1A1A1A !important;
|
||||||
|
color: #FFFFFF !important;
|
||||||
|
padding: 0.5rem !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Response Container */
|
||||||
|
.response-container {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.api-request-box,
|
||||||
|
.json-response-box {
|
||||||
|
border: 2px solid #333;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 1.5rem;
|
||||||
|
background: #111111;
|
||||||
|
}
|
||||||
|
|
||||||
|
.api-request-box label,
|
||||||
|
.json-response-box label {
|
||||||
|
display: block;
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
margin-bottom: 0.5rem;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.api-request-box pre,
|
||||||
|
.json-response-box pre {
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
font-size: 0.85rem;
|
||||||
|
line-height: 1.5;
|
||||||
|
color: #FFFFFF;
|
||||||
|
background: #1A1A1A;
|
||||||
|
padding: 1rem;
|
||||||
|
border-radius: 4px;
|
||||||
|
overflow-x: auto;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Results Section */
|
||||||
|
.results-section {
|
||||||
|
border: 2px solid #333;
|
||||||
|
border-radius: 8px;
|
||||||
|
overflow: hidden;
|
||||||
|
margin-top: 2rem;
|
||||||
|
background: #111111;
|
||||||
|
}
|
||||||
|
|
||||||
|
.results-header {
|
||||||
|
background: #1A1A1A;
|
||||||
|
color: #FFFFFF;
|
||||||
|
padding: 1rem 1.5rem;
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
border-bottom: 1px solid #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
.results-header h2 {
|
||||||
|
font-size: 1.25rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
.copy-btn {
|
||||||
|
background: #09b5a5;
|
||||||
|
color: #000000;
|
||||||
|
border: none;
|
||||||
|
padding: 0.5rem 1rem;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
font-weight: 600;
|
||||||
|
cursor: pointer;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.5rem;
|
||||||
|
transition: background-color 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.copy-btn:hover {
|
||||||
|
background: #09b5a5;
|
||||||
|
}
|
||||||
|
|
||||||
|
.results-content {
|
||||||
|
padding: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.result-info {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||||
|
gap: 1rem;
|
||||||
|
margin-bottom: 1.5rem;
|
||||||
|
padding: 1rem;
|
||||||
|
background: #1A1A1A;
|
||||||
|
border-radius: 4px;
|
||||||
|
border: 1px solid #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
.info-item {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 0.25rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.info-item .label {
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.info-item .value {
|
||||||
|
color: #CCCCCC;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
|
||||||
|
.json-display {
|
||||||
|
background: #1A1A1A;
|
||||||
|
border-radius: 4px;
|
||||||
|
overflow: hidden;
|
||||||
|
border: 1px solid #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
.json-display pre {
|
||||||
|
color: #FFFFFF;
|
||||||
|
padding: 1.5rem;
|
||||||
|
margin: 0;
|
||||||
|
overflow-x: auto;
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
line-height: 1.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Loading State */
|
||||||
|
.loading {
|
||||||
|
text-align: center;
|
||||||
|
padding: 3rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.spinner {
|
||||||
|
width: 40px;
|
||||||
|
height: 40px;
|
||||||
|
border: 3px solid #333;
|
||||||
|
border-top: 3px solid #09b5a5;
|
||||||
|
border-radius: 50%;
|
||||||
|
animation: spin 1s linear infinite;
|
||||||
|
margin: 0 auto 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
@keyframes spin {
|
||||||
|
0% { transform: rotate(0deg); }
|
||||||
|
100% { transform: rotate(360deg); }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Models Page */
|
||||||
|
.models-header {
|
||||||
|
text-align: center;
|
||||||
|
margin-bottom: 3rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.models-header h1 {
|
||||||
|
font-size: 2.5rem;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #FFFFFF;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.models-header p {
|
||||||
|
font-size: 1.1rem;
|
||||||
|
color: #CCCCCC;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* API Requests Page */
|
||||||
|
.requests-header {
|
||||||
|
text-align: center;
|
||||||
|
margin-bottom: 3rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.requests-header h1 {
|
||||||
|
font-size: 2.5rem;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #FFFFFF;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.requests-header p {
|
||||||
|
font-size: 1.1rem;
|
||||||
|
color: #CCCCCC;
|
||||||
|
}
|
||||||
|
|
||||||
|
.requests-container {
|
||||||
|
max-width: 1200px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
.requests-list {
|
||||||
|
display: grid;
|
||||||
|
gap: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-card {
|
||||||
|
border: 2px solid #333;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 1.5rem;
|
||||||
|
background: #111111;
|
||||||
|
transition: border-color 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-card:hover {
|
||||||
|
border-color: #09b5a5;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-header {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
padding-bottom: 1rem;
|
||||||
|
border-bottom: 1px solid #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-info {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-url {
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #09b5a5;
|
||||||
|
font-size: 1.1rem;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-query {
|
||||||
|
color: #CCCCCC;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
margin-top: 0.5rem;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-actions {
|
||||||
|
display: flex;
|
||||||
|
gap: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-curl {
|
||||||
|
background: #1A1A1A;
|
||||||
|
border: 1px solid #333;
|
||||||
|
border-radius: 4px;
|
||||||
|
padding: 1rem;
|
||||||
|
margin-top: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-curl h4 {
|
||||||
|
color: #FFFFFF;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
font-weight: 600;
|
||||||
|
margin-bottom: 0.5rem;
|
||||||
|
font-family: 'Courier New', monospace;
|
||||||
|
}
|
||||||
|
|
||||||
|
.request-curl pre {
|
||||||
|
color: #CCCCCC;
|
||||||
|
font-size: 0.8rem;
|
||||||
|
line-height: 1.4;
|
||||||
|
overflow-x: auto;
|
||||||
|
white-space: pre-wrap;
|
||||||
|
word-break: break-all;
|
||||||
|
background: #111111;
|
||||||
|
padding: 0.75rem;
|
||||||
|
border-radius: 4px;
|
||||||
|
border: 1px solid #333;
|
||||||
|
}
|
||||||
|
|
||||||
|
.models-container {
|
||||||
|
max-width: 800px;
|
||||||
|
margin: 0 auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-form-section {
|
||||||
|
border: 2px solid #333;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 2rem;
|
||||||
|
margin-bottom: 2rem;
|
||||||
|
background: #111111;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-form-section h3 {
|
||||||
|
font-size: 1.25rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
margin-bottom: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-form {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.form-row {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 1fr 1fr;
|
||||||
|
gap: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.save-btn {
|
||||||
|
padding: 1rem;
|
||||||
|
background: #09b5a5;
|
||||||
|
color: #000000;
|
||||||
|
border: none;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 1rem;
|
||||||
|
font-weight: 600;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: background-color 0.2s ease;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
gap: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Darker teal on hover. The base .save-btn background is already #09b5a5,
   so the previous identical hover color made the declared
   background-color transition a visual no-op. */
.save-btn:hover {
    background: #078f83;
}
|
||||||
|
|
||||||
|
.saved-models-section h3 {
|
||||||
|
font-size: 1.25rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
margin-bottom: 1.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.models-list {
|
||||||
|
display: grid;
|
||||||
|
gap: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-card {
|
||||||
|
border: 2px solid #333;
|
||||||
|
border-radius: 8px;
|
||||||
|
padding: 1.5rem;
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
transition: border-color 0.2s ease;
|
||||||
|
background: #111111;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-card:hover {
|
||||||
|
border-color: #09b5a5;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-info {
|
||||||
|
flex: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-name {
|
||||||
|
font-weight: 600;
|
||||||
|
color: #FFFFFF;
|
||||||
|
font-size: 1.1rem;
|
||||||
|
margin-bottom: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-provider {
|
||||||
|
color: #CCCCCC;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-actions {
|
||||||
|
display: flex;
|
||||||
|
gap: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.btn-danger {
|
||||||
|
background: #FF4444;
|
||||||
|
color: #FFFFFF;
|
||||||
|
border: none;
|
||||||
|
padding: 0.5rem 1rem;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 0.9rem;
|
||||||
|
font-weight: 600;
|
||||||
|
cursor: pointer;
|
||||||
|
transition: background-color 0.2s ease;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.btn-danger:hover {
|
||||||
|
background: #CC3333;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Toast Notifications */
|
||||||
|
.toast-container {
|
||||||
|
position: fixed;
|
||||||
|
top: 20px;
|
||||||
|
right: 20px;
|
||||||
|
z-index: 1000;
|
||||||
|
}
|
||||||
|
|
||||||
|
.toast {
|
||||||
|
background: #111111;
|
||||||
|
border: 2px solid #333;
|
||||||
|
border-radius: 4px;
|
||||||
|
padding: 1rem 1.5rem;
|
||||||
|
margin-bottom: 0.5rem;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.5rem;
|
||||||
|
animation: slideIn 0.3s ease;
|
||||||
|
max-width: 400px;
|
||||||
|
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
|
||||||
|
color: #FFFFFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
.toast.success {
|
||||||
|
border-color: #09b5a5;
|
||||||
|
background: #0A1A1A;
|
||||||
|
}
|
||||||
|
|
||||||
|
.toast.error {
|
||||||
|
border-color: #FF4444;
|
||||||
|
background: #1A0A0A;
|
||||||
|
}
|
||||||
|
|
||||||
|
.toast.info {
|
||||||
|
border-color: #09b5a5;
|
||||||
|
background: #0A1A1A;
|
||||||
|
}
|
||||||
|
|
||||||
|
@keyframes slideIn {
|
||||||
|
from {
|
||||||
|
transform: translateX(100%);
|
||||||
|
opacity: 0;
|
||||||
|
}
|
||||||
|
to {
|
||||||
|
transform: translateX(0);
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Responsive Design */
|
||||||
|
@media (max-width: 768px) {
|
||||||
|
.header-content {
|
||||||
|
padding: 0 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.main-content {
|
||||||
|
padding: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.hero-title {
|
||||||
|
font-size: 2rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.workflow-demo {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
gap: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.workflow-arrow {
|
||||||
|
transform: rotate(90deg);
|
||||||
|
margin: 1rem 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.form-options {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
|
|
||||||
|
.form-row {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
|
|
||||||
|
.result-info {
|
||||||
|
grid-template-columns: 1fr;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-card {
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 1rem;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.model-actions {
|
||||||
|
width: 100%;
|
||||||
|
justify-content: center;
|
||||||
|
}
|
||||||
|
}
|
||||||
28
docs/examples/website-to-api/test_api.py
Normal file
28
docs/examples/website-to-api/test_api.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
import asyncio
|
||||||
|
from web_scraper_lib import scrape_website
|
||||||
|
import os
|
||||||
|
|
||||||
|
async def test_library():
    """Exercise the mini library end-to-end against a live product page.

    Picks the first saved model configuration from the local ``models``
    directory, runs :func:`scrape_website` with a fixed URL/query, and
    prints either the extracted data or the failure reason.
    """
    print("=== Testing Mini Library ===")

    # Test 1: Scrape with a custom model
    url = "https://marketplace.mainstreet.co.in/collections/adidas-yeezy/products/adidas-yeezy-boost-350-v2-yecheil-non-reflective"
    query = "Extract the following data: Product name, Product price, Product description, Product size. DO NOT EXTRACT ANYTHING ELSE."
    model_name = select_model_name()

    print(f"Scraping: {url}")
    print(f"Query: {query}")

    try:
        result = await scrape_website(url, query, model_name)
        print("✅ Library test successful!")
        print(f"Extracted data: {result['extracted_data']}")
    except Exception as e:
        # Network / LLM failures are expected in ad-hoc runs; report, don't crash.
        print(f"❌ Library test failed: {e}")


def select_model_name(models_dir: str = "models") -> str:
    """Return the name (file stem) of the first saved model config in *models_dir*.

    Only ``*.json`` files are considered, in sorted order so the choice is
    deterministic across filesystems.

    Raises:
        FileNotFoundError: if the directory is missing or holds no ``.json``
            config.  (The original code raised a bare ``Exception`` for a
            missing directory and crashed with ``IndexError`` on an empty
            one; it also truncated model names containing dots via
            ``split(".")[0]`` — ``os.path.splitext`` keeps them intact.)
    """
    if not os.path.isdir(models_dir):
        raise FileNotFoundError(f"No models found in {models_dir} directory")
    configs = sorted(f for f in os.listdir(models_dir) if f.endswith(".json"))
    if not configs:
        raise FileNotFoundError(f"No models found in {models_dir} directory")
    return os.path.splitext(configs[0])[0]


if __name__ == "__main__":
    asyncio.run(test_library())
|
||||||
67
docs/examples/website-to-api/test_models.py
Normal file
67
docs/examples/website-to-api/test_models.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for the new model management functionality.
|
||||||
|
This script demonstrates how to save and use custom model configurations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
# API base URL
|
||||||
|
BASE_URL = "http://localhost:8000"
|
||||||
|
|
||||||
|
def test_model_management():
    """Smoke-test the model-management REST endpoints.

    Walks through list -> create -> list -> delete -> list against a
    locally running API server and prints each step's HTTP status and
    pretty-printed JSON body.
    """
    print("=== Testing Model Management ===")

    # 1. List current models
    _show("\n1. Listing current models:",
          requests.get(f"{BASE_URL}/models", timeout=10))

    # 2. Save another model configuration (OpenAI example)
    openai_config = {
        "model_name": "my-openai",
        "provider": "openai",
        "api_token": "your-openai-api-key-here"
    }
    _show("\n2. Saving OpenAI model configuration:",
          requests.post(f"{BASE_URL}/models", json=openai_config, timeout=10))

    # 3. List models again to see the new ones
    _show("\n3. Listing models after adding new ones:",
          requests.get(f"{BASE_URL}/models", timeout=10))

    # 4. Delete a model configuration
    _show("\n4. Deleting a model configuration:",
          requests.delete(f"{BASE_URL}/models/my-openai", timeout=10))

    # 5. Final list of models
    _show("\n5. Final list of models:",
          requests.get(f"{BASE_URL}/models", timeout=10))


def _show(step: str, response) -> None:
    """Print one step header plus the response status and JSON body.

    Replaces the five copy-pasted status/JSON print blocks.  A ``timeout``
    is now passed to every request above: without one, ``requests`` can
    block forever on an unresponsive server.
    """
    print(step)
    print(f"Status: {response.status_code}")
    print(f"Response: {json.dumps(response.json(), indent=2)}")


if __name__ == "__main__":
    print("Model Management Test Script")
    print("Make sure the API server is running on http://localhost:8000")
    print("=" * 50)

    try:
        test_model_management()
    except requests.exceptions.ConnectionError:
        print("Error: Could not connect to the API server.")
        print("Make sure the server is running with: python api_server.py")
    except Exception as e:
        print(f"Error: {e}")
|
||||||
397
docs/examples/website-to-api/web_scraper_lib.py
Normal file
397
docs/examples/website-to-api/web_scraper_lib.py
Normal file
@@ -0,0 +1,397 @@
|
|||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
BrowserConfig,
|
||||||
|
CacheMode,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
LLMConfig,
|
||||||
|
JsonCssExtractionStrategy,
|
||||||
|
LLMExtractionStrategy
|
||||||
|
)
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import hashlib
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
from litellm import completion
|
||||||
|
|
||||||
|
class ModelConfig:
    """Provider name plus API token for a single LLM endpoint.

    Round-trips losslessly through ``to_dict`` / ``from_dict`` so it can
    be persisted as JSON.  Note that the token is held — and serialized —
    in plain text.
    """

    def __init__(self, provider: str, api_token: str):
        self.provider = provider
        self.api_token = api_token

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict (inverse of :meth:`from_dict`)."""
        return {"provider": self.provider, "api_token": self.api_token}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ModelConfig':
        """Rebuild a ModelConfig from a dict produced by :meth:`to_dict`."""
        return cls(data["provider"], data["api_token"])
|
||||||
|
|
||||||
|
class WebScraperAgent:
    """
    A mini library that converts any website into a structured data API.

    Features:
    1. Provide a URL and tell AI what data you need in plain English
    2. Generate: Agent reverse-engineers the site and deploys custom scraper
    3. Integrate: Use private API endpoint to get structured data
    4. Support for custom LLM models and API keys
    """

    def __init__(self, schemas_dir: str = "schemas", models_dir: str = "models"):
        # Both directories are resolved relative to the current working
        # directory and created eagerly on construction.
        self.schemas_dir = schemas_dir
        self.models_dir = models_dir
        os.makedirs(self.schemas_dir, exist_ok=True)
        os.makedirs(self.models_dir, exist_ok=True)

    def _generate_schema_key(self, url: str, query: str) -> str:
        """Generate a unique key for schema caching based on URL and query."""
        # md5 is used purely as a cache-key digest here, not for security.
        content = f"{url}:{query}"
        return hashlib.md5(content.encode()).hexdigest()

    def save_model_config(self, model_name: str, provider: str, api_token: str) -> bool:
        """
        Save a model configuration for later use.

        Args:
            model_name: User-friendly name for the model
            provider: LLM provider (e.g., 'gemini', 'openai', 'anthropic')
            api_token: API token for the provider

        Returns:
            True if saved successfully
        """
        # NOTE(review): the API token is written to disk in plain-text JSON,
        # and model_name is interpolated into a filesystem path unchecked —
        # a name like "../x" escapes models_dir.  Sanitize if model_name can
        # come from untrusted API callers.
        try:
            model_config = ModelConfig(provider, api_token)
            config_path = os.path.join(self.models_dir, f"{model_name}.json")

            with open(config_path, "w") as f:
                json.dump(model_config.to_dict(), f, indent=2)

            print(f"Model configuration saved: {model_name}")
            return True
        except Exception as e:
            # Best-effort: report and signal failure via the bool return.
            print(f"Failed to save model configuration: {e}")
            return False

    def load_model_config(self, model_name: str) -> Optional[ModelConfig]:
        """
        Load a saved model configuration.

        Args:
            model_name: Name of the saved model configuration

        Returns:
            ModelConfig object or None if not found (or unreadable)
        """
        try:
            config_path = os.path.join(self.models_dir, f"{model_name}.json")
            if not os.path.exists(config_path):
                return None

            with open(config_path, "r") as f:
                data = json.load(f)

            return ModelConfig.from_dict(data)
        except Exception as e:
            # Corrupt/unreadable config is treated the same as "not found".
            print(f"Failed to load model configuration: {e}")
            return None

    def list_saved_models(self) -> List[str]:
        """List all saved model configurations (file stems of *.json files)."""
        models = []
        for filename in os.listdir(self.models_dir):
            if filename.endswith('.json'):
                models.append(filename[:-5])  # Remove .json extension
        return models

    def delete_model_config(self, model_name: str) -> bool:
        """
        Delete a saved model configuration.

        Args:
            model_name: Name of the model configuration to delete

        Returns:
            True if deleted successfully, False if missing or on error
        """
        try:
            config_path = os.path.join(self.models_dir, f"{model_name}.json")
            if os.path.exists(config_path):
                os.remove(config_path)
                print(f"Model configuration deleted: {model_name}")
                return True
            return False
        except Exception as e:
            print(f"Failed to delete model configuration: {e}")
            return False

    async def _load_or_generate_schema(self, url: str, query: str, session_id: str = "schema_generator", model_name: Optional[str] = None) -> Dict[str, Any]:
        """
        Loads schema from cache if exists, otherwise generates using AI.
        This is the "Generate" step - our agent reverse-engineers the site.

        Args:
            url: URL to scrape
            query: Query for data extraction
            session_id: Session identifier
            model_name: Name of saved model configuration to use

        Raises:
            ValueError: if model_name is missing or has no saved config.
        """
        schema_key = self._generate_schema_key(url, query)
        schema_path = os.path.join(self.schemas_dir, f"{schema_key}.json")

        # Cache hit: same (url, query) pair was analyzed before.
        if os.path.exists(schema_path):
            print(f"Schema found in cache for {url}")
            with open(schema_path, "r") as f:
                return json.load(f)

        print(f"Generating new schema for {url}")
        print(f"Query: {query}")
        # NOTE(review): "SCHEME" below is a typo for "SCHEMA" in the prompt
        # text; left untouched since it is a runtime string sent to the LLM.
        query += """
        IMPORTANT:
        GENERATE THE SCHEMA WITH ONLY THE FIELDS MENTIONED IN THE QUERY. MAKE SURE THE NUMBER OF FIELDS IN THE SCHEME MATCH THE NUMBER OF FIELDS IN THE QUERY.
        """

        # Step 1: Fetch the page HTML
        async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS,
                    session_id=session_id,
                    simulate_user=True,
                    remove_overlay_elements=True,
                    delay_before_return_html=5,
                )
            )
            html = result.fit_html

        # Step 2: Generate schema using AI with custom model if specified
        print("AI is analyzing the page structure...")

        # Use custom model configuration if provided
        if model_name:
            model_config = self.load_model_config(model_name)
            if model_config:
                llm_config = LLMConfig(
                    provider=model_config.provider,
                    api_token=model_config.api_token
                )
                print(f"Using custom model: {model_name}")
            else:
                raise ValueError(f"Model configuration '{model_name}' not found. Please add it from the Models page.")
        else:
            # Require a model to be specified
            raise ValueError("No model specified. Please select a model from the dropdown or add one from the Models page.")

        schema = JsonCssExtractionStrategy.generate_schema(
            html=html,
            llm_config=llm_config,
            query=query
        )

        # Step 3: Cache the generated schema
        print(f"Schema generated and cached: {json.dumps(schema, indent=2)}")
        with open(schema_path, "w") as f:
            json.dump(schema, f, indent=2)

        return schema

    def _generate_llm_schema(self, query: str, llm_config: LLMConfig) -> Dict[str, Any]:
        """
        Generate a schema for a given query using a custom LLM model.

        Args:
            query: Plain English description of what data to extract
            llm_config: LLM configuration (provider + API token) to use

        Returns:
            The raw LLM message content.
            NOTE(review): this is the message content as returned by the
            provider — likely a *string*, not the Dict the annotation
            claims; callers pass it straight into LLMExtractionStrategy.
            Confirm and align annotation or parse with json.loads.
        """
        # ask the model to generate a schema for the given query in the form of a json.
        prompt = f"""
        IDENTIFY THE FIELDS FOR EXTRACTION MENTIONED IN THE QUERY and GENERATE A JSON SCHEMA FOR THE FIELDS.
        eg.
        {{
            "name": "str",
            "age": "str",
            "email": "str",
            "product_name": "str",
            "product_price": "str",
            "product_description": "str",
            "product_image": "str",
            "product_url": "str",
            "product_rating": "str",
            "product_reviews": "str",
        }}
        Here is the query:
        {query}
        IMPORTANT:
        THE RESULT SHOULD BE A JSON OBJECT.
        MAKE SURE THE NUMBER OF FIELDS IN THE RESULT MATCH THE NUMBER OF FIELDS IN THE QUERY.
        THE RESULT SHOULD BE A JSON OBJECT.
        """
        # NOTE(review): 'result_type' is not a documented litellm completion()
        # kwarg — 'response_format={"type": "json_object"}' is the usual way
        # to request JSON output; verify against the litellm version in use.
        response = completion(
            model=llm_config.provider,
            messages=[{"role": "user", "content": prompt}],
            api_key=llm_config.api_token,
            result_type="json"
        )

        # NOTE(review): assumes the litellm response exposes .json() returning
        # the OpenAI-style payload — confirm; response.choices[0].message.content
        # is the documented access path.
        return response.json()["choices"][0]["message"]["content"]

    async def scrape_data_with_llm(self, url: str, query: str, model_name: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape structured data from any website using a custom LLM model.

        Unlike :meth:`scrape_data`, this performs LLM-based extraction on
        every call (no schema cache, no CSS scraper).

        Args:
            url: The website URL to scrape
            query: Plain English description of what data to extract
            model_name: Name of saved model configuration to use

        Returns:
            Dict with url, query, extracted_data and timestamp (if any).

        Raises:
            ValueError: if model_name is missing or has no saved config.
        """

        if model_name:
            model_config = self.load_model_config(model_name)
            if model_config:
                llm_config = LLMConfig(
                    provider=model_config.provider,
                    api_token=model_config.api_token
                )
                print(f"Using custom model: {model_name}")
            else:
                raise ValueError(f"Model configuration '{model_name}' not found. Please add it from the Models page.")
        else:
            # Require a model to be specified
            raise ValueError("No model specified. Please select a model from the dropdown or add one from the Models page.")

        query += """\n
        IMPORTANT:
        THE RESULT SHOULD BE A JSON OBJECT WITH THE ONLY THE FIELDS MENTIONED IN THE QUERY.
        MAKE SURE THE NUMBER OF FIELDS IN THE RESULT MATCH THE NUMBER OF FIELDS IN THE QUERY.
        THE RESULT SHOULD BE A JSON OBJECT.
        """

        schema = self._generate_llm_schema(query, llm_config)

        print(f"Schema: {schema}")

        # NOTE(review): 'result_type' may not be an accepted
        # LLMExtractionStrategy kwarg ('extraction_type' is the documented
        # one) — verify against the installed crawl4ai version.
        llm_extraction_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction=query,
            result_type="json",
            schema=schema
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS,
                    simulate_user=True,
                    extraction_strategy=llm_extraction_strategy,
                )
            )
        extracted_data = result.extracted_content
        if isinstance(extracted_data, str):
            try:
                extracted_data = json.loads(extracted_data)
            except json.JSONDecodeError:
                # If it's not valid JSON, keep it as string
                pass

        return {
            "url": url,
            "query": query,
            "extracted_data": extracted_data,
            "timestamp": result.timestamp if hasattr(result, 'timestamp') else None
        }

    async def scrape_data(self, url: str, query: str, model_name: Optional[str] = None) -> Dict[str, Any]:
        """
        Main method to scrape structured data from any website.

        Uses the cached (or freshly AI-generated) CSS schema, so repeat
        calls for the same (url, query) avoid the LLM entirely.

        Args:
            url: The website URL to scrape
            query: Plain English description of what data to extract
            model_name: Name of saved model configuration to use

        Returns:
            Structured data extracted from the website
        """
        # Step 1: Generate or load schema (reverse-engineer the site)
        schema = await self._load_or_generate_schema(url=url, query=query, model_name=model_name)

        # Step 2: Deploy custom high-speed scraper
        print(f"Deploying custom scraper for {url}")
        browser_config = BrowserConfig(headless=True)

        async with AsyncWebCrawler(config=browser_config) as crawler:
            run_config = CrawlerRunConfig(
                extraction_strategy=JsonCssExtractionStrategy(schema=schema),
            )
            result = await crawler.arun(url=url, config=run_config)

        # Step 3: Return structured data
        # Parse extracted_content if it's a JSON string
        extracted_data = result.extracted_content
        if isinstance(extracted_data, str):
            try:
                extracted_data = json.loads(extracted_data)
            except json.JSONDecodeError:
                # If it's not valid JSON, keep it as string
                pass

        return {
            "url": url,
            "query": query,
            "extracted_data": extracted_data,
            "schema_used": schema,
            "timestamp": result.timestamp if hasattr(result, 'timestamp') else None
        }

    async def get_cached_schemas(self) -> Dict[str, str]:
        """Get list of cached schemas, mapping schema key -> filename.

        NOTE(review): declared async but performs no await — could be a
        plain method; kept async to preserve the call-site contract.
        """
        schemas = {}
        for filename in os.listdir(self.schemas_dir):
            if filename.endswith('.json'):
                schema_key = filename[:-5]  # Remove .json extension
                schemas[schema_key] = filename
        return schemas

    def clear_cache(self) -> None:
        """Clear all cached schemas (removes and recreates schemas_dir)."""
        import shutil
        if os.path.exists(self.schemas_dir):
            shutil.rmtree(self.schemas_dir)
        os.makedirs(self.schemas_dir, exist_ok=True)
        print("Schema cache cleared")
|
||||||
|
|
||||||
|
# Convenience function for simple usage
async def scrape_website(url: str, query: str, model_name: Optional[str] = None) -> Dict[str, Any]:
    """
    Simple function to scrape any website with plain English instructions.

    Constructs a throwaway :class:`WebScraperAgent` (default schema/model
    directories) and delegates to its schema-cached ``scrape_data``.

    Args:
        url: Website URL
        query: Plain English description of what data to extract
        model_name: Name of saved model configuration to use

    Returns:
        Extracted structured data
    """
    return await WebScraperAgent().scrape_data(url, query, model_name)
|
||||||
|
|
||||||
|
async def scrape_website_with_llm(url: str, query: str, model_name: Optional[str] = None) -> Dict[str, Any]:
    """
    Scrape structured data from any website using a custom LLM model.

    Convenience wrapper mirroring :func:`scrape_website` but using pure
    LLM extraction (no schema cache).

    Args:
        url: The website URL to scrape
        query: Plain English description of what data to extract
        model_name: Name of saved model configuration to use

    Returns:
        Dict with url, query, extracted_data and timestamp (if available).
    """
    # Return annotation added for consistency with scrape_website; the
    # wrapped method already returns Dict[str, Any].
    agent = WebScraperAgent()
    return await agent.scrape_data_with_llm(url, query, model_name)
|
||||||
@@ -7,13 +7,13 @@ Simple proxy configuration with `BrowserConfig`:
|
|||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import BrowserConfig
|
from crawl4ai.async_configs import BrowserConfig
|
||||||
|
|
||||||
# Using proxy URL
|
# Using HTTP proxy
|
||||||
browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
|
browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"})
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
|
||||||
# Using SOCKS proxy
|
# Using SOCKS proxy
|
||||||
browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080")
|
browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"})
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(url="https://example.com")
|
||||||
```
|
```
|
||||||
@@ -25,7 +25,11 @@ Use an authenticated proxy with `BrowserConfig`:
|
|||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import BrowserConfig
|
from crawl4ai.async_configs import BrowserConfig
|
||||||
|
|
||||||
browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
|
browser_config = BrowserConfig(proxy_config={
|
||||||
|
"server": "http://[host]:[port]",
|
||||||
|
"username": "[username]",
|
||||||
|
"password": "[password]",
|
||||||
|
})
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(url="https://example.com")
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ browser_cfg = BrowserConfig(
|
|||||||
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
||||||
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
||||||
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
||||||
| **`proxy`** | `str` (default: `None`) | Single-proxy URL if you want all traffic to go through it, e.g. `"http://user:pass@proxy:8080"`. |
|
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
||||||
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
|
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
|
||||||
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
||||||
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
||||||
@@ -155,6 +155,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
|||||||
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
||||||
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
||||||
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
||||||
|
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
|
||||||
|
|
||||||
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
||||||
|
|
||||||
|
|||||||
@@ -108,7 +108,19 @@ config = AdaptiveConfig(
|
|||||||
embedding_min_confidence_threshold=0.1 # Stop if completely irrelevant
|
embedding_min_confidence_threshold=0.1 # Stop if completely irrelevant
|
||||||
)
|
)
|
||||||
|
|
||||||
# With custom embedding provider (e.g., OpenAI)
|
# With custom LLM provider for query expansion (recommended)
|
||||||
|
from crawl4ai import LLMConfig
|
||||||
|
|
||||||
|
config = AdaptiveConfig(
|
||||||
|
strategy="embedding",
|
||||||
|
embedding_llm_config=LLMConfig(
|
||||||
|
provider='openai/text-embedding-3-small',
|
||||||
|
api_token='your-api-key',
|
||||||
|
temperature=0.7
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Alternative: Dictionary format (backward compatible)
|
||||||
config = AdaptiveConfig(
|
config = AdaptiveConfig(
|
||||||
strategy="embedding",
|
strategy="embedding",
|
||||||
embedding_llm_config={
|
embedding_llm_config={
|
||||||
|
|||||||
@@ -472,6 +472,17 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag
|
|||||||
|
|
||||||
5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
|
5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
|
||||||
|
|
||||||
|
6.**Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections:
|
||||||
|
|
||||||
|
```python
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
|
||||||
|
preserve_https_for_internal_links=True # Keep HTTPS even if server redirects to HTTP
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 10. Summary & Next Steps
|
## 10. Summary & Next Steps
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ name = "Crawl4AI"
|
|||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
requires-python = ">=3.9"
|
requires-python = ">=3.10"
|
||||||
license = "Apache-2.0"
|
license = "Apache-2.0"
|
||||||
authors = [
|
authors = [
|
||||||
{name = "Unclecode", email = "unclecode@kidocode.com"}
|
{name = "Unclecode", email = "unclecode@kidocode.com"}
|
||||||
@@ -36,6 +36,7 @@ dependencies = [
|
|||||||
"PyYAML>=6.0",
|
"PyYAML>=6.0",
|
||||||
"nltk>=3.9.1",
|
"nltk>=3.9.1",
|
||||||
"rich>=13.9.4",
|
"rich>=13.9.4",
|
||||||
|
"cssselect>=1.2.0",
|
||||||
"httpx>=0.27.2",
|
"httpx>=0.27.2",
|
||||||
"httpx[http2]>=0.27.2",
|
"httpx[http2]>=0.27.2",
|
||||||
"fake-useragent>=2.0.3",
|
"fake-useragent>=2.0.3",
|
||||||
@@ -51,7 +52,6 @@ classifiers = [
|
|||||||
"Development Status :: 4 - Beta",
|
"Development Status :: 4 - Beta",
|
||||||
"Intended Audience :: Developers",
|
"Intended Audience :: Developers",
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"Programming Language :: Python :: 3.9",
|
|
||||||
"Programming Language :: Python :: 3.10",
|
"Programming Language :: Python :: 3.10",
|
||||||
"Programming Language :: Python :: 3.11",
|
"Programming Language :: Python :: 3.11",
|
||||||
"Programming Language :: Python :: 3.12",
|
"Programming Language :: Python :: 3.12",
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ psutil>=6.1.1
|
|||||||
PyYAML>=6.0
|
PyYAML>=6.0
|
||||||
nltk>=3.9.1
|
nltk>=3.9.1
|
||||||
rich>=13.9.4
|
rich>=13.9.4
|
||||||
|
cssselect>=1.2.0
|
||||||
chardet>=5.2.0
|
chardet>=5.2.0
|
||||||
brotli>=1.1.0
|
brotli>=1.1.0
|
||||||
httpx[http2]>=0.27.2
|
httpx[http2]>=0.27.2
|
||||||
|
|||||||
3
setup.py
3
setup.py
@@ -56,11 +56,10 @@ setup(
|
|||||||
"Development Status :: 3 - Alpha",
|
"Development Status :: 3 - Alpha",
|
||||||
"Intended Audience :: Developers",
|
"Intended Audience :: Developers",
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"Programming Language :: Python :: 3.9",
|
|
||||||
"Programming Language :: Python :: 3.10",
|
"Programming Language :: Python :: 3.10",
|
||||||
"Programming Language :: Python :: 3.11",
|
"Programming Language :: Python :: 3.11",
|
||||||
"Programming Language :: Python :: 3.12",
|
"Programming Language :: Python :: 3.12",
|
||||||
"Programming Language :: Python :: 3.13",
|
"Programming Language :: Python :: 3.13",
|
||||||
],
|
],
|
||||||
python_requires=">=3.9",
|
python_requires=">=3.10",
|
||||||
)
|
)
|
||||||
|
|||||||
154
tests/adaptive/test_llm_embedding.py
Normal file
154
tests/adaptive/test_llm_embedding.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
|
||||||
|
|
||||||
|
|
||||||
|
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
|
||||||
|
"""Test a specific configuration"""
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"Configuration: {name}")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||||
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
|
result = await adaptive.digest(start_url=url, query=query)
|
||||||
|
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("CRAWL STATISTICS")
|
||||||
|
print("="*50)
|
||||||
|
adaptive.print_stats(detailed=False)
|
||||||
|
|
||||||
|
# Get the most relevant content found
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("MOST RELEVANT PAGES")
|
||||||
|
print("="*50)
|
||||||
|
|
||||||
|
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||||
|
for i, page in enumerate(relevant_pages, 1):
|
||||||
|
print(f"\n{i}. {page['url']}")
|
||||||
|
print(f" Relevance Score: {page['score']:.2%}")
|
||||||
|
|
||||||
|
# Show a snippet of the content
|
||||||
|
content = page['content'] or ""
|
||||||
|
if content:
|
||||||
|
snippet = content[:200].replace('\n', ' ')
|
||||||
|
if len(content) > 200:
|
||||||
|
snippet += "..."
|
||||||
|
print(f" Preview: {snippet}")
|
||||||
|
|
||||||
|
print(f"\n{'='*50}")
|
||||||
|
print(f"Pages crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"Final confidence: {adaptive.confidence:.1%}")
|
||||||
|
print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")
|
||||||
|
|
||||||
|
if result.metrics.get('is_irrelevant', False):
|
||||||
|
print("⚠️ Query detected as irrelevant!")
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
async def llm_embedding():
|
||||||
|
"""Demonstrate various embedding configurations"""
|
||||||
|
|
||||||
|
print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Base URL and query for testing
|
||||||
|
test_url = "https://docs.python.org/3/library/asyncio.html"
|
||||||
|
|
||||||
|
openai_llm_config = LLMConfig(
|
||||||
|
provider='openai/text-embedding-3-small',
|
||||||
|
api_token=os.getenv('OPENAI_API_KEY'),
|
||||||
|
temperature=0.7,
|
||||||
|
max_tokens=2000
|
||||||
|
)
|
||||||
|
config_openai = AdaptiveConfig(
|
||||||
|
strategy="embedding",
|
||||||
|
max_pages=10,
|
||||||
|
|
||||||
|
# Use OpenAI embeddings
|
||||||
|
embedding_llm_config=openai_llm_config,
|
||||||
|
# embedding_llm_config={
|
||||||
|
# 'provider': 'openai/text-embedding-3-small',
|
||||||
|
# 'api_token': os.getenv('OPENAI_API_KEY')
|
||||||
|
# },
|
||||||
|
|
||||||
|
# OpenAI embeddings are high quality, can be stricter
|
||||||
|
embedding_k_exp=4.0,
|
||||||
|
n_query_variations=12
|
||||||
|
)
|
||||||
|
|
||||||
|
await test_configuration(
|
||||||
|
"OpenAI Embeddings",
|
||||||
|
config_openai,
|
||||||
|
test_url,
|
||||||
|
# "event-driven architecture patterns"
|
||||||
|
"async await context managers coroutines"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def basic_adaptive_crawling():
|
||||||
|
"""Basic adaptive crawling example"""
|
||||||
|
|
||||||
|
# Initialize the crawler
|
||||||
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
|
# Create an adaptive crawler with default settings (statistical strategy)
|
||||||
|
adaptive = AdaptiveCrawler(crawler)
|
||||||
|
|
||||||
|
# Note: You can also use embedding strategy for semantic understanding:
|
||||||
|
# from crawl4ai import AdaptiveConfig
|
||||||
|
# config = AdaptiveConfig(strategy="embedding")
|
||||||
|
# adaptive = AdaptiveCrawler(crawler, config)
|
||||||
|
|
||||||
|
# Start adaptive crawling
|
||||||
|
print("Starting adaptive crawl for Python async programming information...")
|
||||||
|
result = await adaptive.digest(
|
||||||
|
start_url="https://docs.python.org/3/library/asyncio.html",
|
||||||
|
query="async await context managers coroutines"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Display crawl statistics
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("CRAWL STATISTICS")
|
||||||
|
print("="*50)
|
||||||
|
adaptive.print_stats(detailed=False)
|
||||||
|
|
||||||
|
# Get the most relevant content found
|
||||||
|
print("\n" + "="*50)
|
||||||
|
print("MOST RELEVANT PAGES")
|
||||||
|
print("="*50)
|
||||||
|
|
||||||
|
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||||
|
for i, page in enumerate(relevant_pages, 1):
|
||||||
|
print(f"\n{i}. {page['url']}")
|
||||||
|
print(f" Relevance Score: {page['score']:.2%}")
|
||||||
|
|
||||||
|
# Show a snippet of the content
|
||||||
|
content = page['content'] or ""
|
||||||
|
if content:
|
||||||
|
snippet = content[:200].replace('\n', ' ')
|
||||||
|
if len(content) > 200:
|
||||||
|
snippet += "..."
|
||||||
|
print(f" Preview: {snippet}")
|
||||||
|
|
||||||
|
# Show final confidence
|
||||||
|
print(f"\n{'='*50}")
|
||||||
|
print(f"Final Confidence: {adaptive.confidence:.2%}")
|
||||||
|
print(f"Total Pages Crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")
|
||||||
|
|
||||||
|
|
||||||
|
if adaptive.confidence >= 0.8:
|
||||||
|
print("✓ High confidence - can answer detailed questions about async Python")
|
||||||
|
elif adaptive.confidence >= 0.6:
|
||||||
|
print("~ Moderate confidence - can answer basic questions")
|
||||||
|
else:
|
||||||
|
print("✗ Low confidence - need more information")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(llm_embedding())
|
||||||
|
# asyncio.run(basic_adaptive_crawling())
|
||||||
@@ -112,7 +112,7 @@ async def test_proxy_settings():
|
|||||||
headless=True,
|
headless=True,
|
||||||
verbose=False,
|
verbose=False,
|
||||||
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
|
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
|
||||||
proxy="http://127.0.0.1:8080", # Assuming local proxy server for test
|
proxy_config={"server": "http://127.0.0.1:8080"}, # Assuming local proxy server for test
|
||||||
use_managed_browser=False,
|
use_managed_browser=False,
|
||||||
use_persistent_context=False,
|
use_persistent_context=False,
|
||||||
) as crawler:
|
) as crawler:
|
||||||
|
|||||||
@@ -143,7 +143,40 @@ class TestCrawlEndpoints:
|
|||||||
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||||
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
|
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
|
||||||
# It might be null, missing, or populated depending on the server's default behavior
|
# It might be null, missing, or populated depending on the server's default behavior
|
||||||
|
async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test that /crawl endpoint handles stream=True directly without redirect."""
|
||||||
|
payload = {
|
||||||
|
"urls": [SIMPLE_HTML_URL],
|
||||||
|
"browser_config": {
|
||||||
|
"type": "BrowserConfig",
|
||||||
|
"params": {
|
||||||
|
"headless": True,
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"stream": True, # Set stream to True for direct streaming
|
||||||
|
"screenshot": False,
|
||||||
|
"cache_mode": CacheMode.BYPASS.value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Send a request to the /crawl endpoint - should handle streaming directly
|
||||||
|
async with async_client.stream("POST", "/crawl", json=payload) as response:
|
||||||
|
assert response.status_code == 200
|
||||||
|
assert response.headers["content-type"] == "application/x-ndjson"
|
||||||
|
assert response.headers.get("x-stream-status") == "active"
|
||||||
|
|
||||||
|
results = await process_streaming_response(response)
|
||||||
|
|
||||||
|
assert len(results) == 1
|
||||||
|
result = results[0]
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert result["url"] == SIMPLE_HTML_URL
|
||||||
|
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||||
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
|
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
|
||||||
"""Test /crawl/stream with a single URL and simple config values."""
|
"""Test /crawl/stream with a single URL and simple config values."""
|
||||||
payload = {
|
payload = {
|
||||||
@@ -635,7 +668,209 @@ class TestCrawlEndpoints:
|
|||||||
pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
|
pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
|
||||||
except Exception as e: # Catch any other unexpected error
|
except Exception as e: # Catch any other unexpected error
|
||||||
pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
|
pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
|
||||||
|
|
||||||
|
|
||||||
|
# 7. Error Handling Tests
|
||||||
|
async def test_invalid_url_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test error handling for invalid URLs."""
|
||||||
|
payload = {
|
||||||
|
"urls": ["invalid-url", "https://nonexistent-domain-12345.com"],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": CacheMode.BYPASS.value}}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
# Should return 200 with failed results, not 500
|
||||||
|
print(f"Status code: {response.status_code}")
|
||||||
|
print(f"Response: {response.text}")
|
||||||
|
assert response.status_code == 500
|
||||||
|
data = response.json()
|
||||||
|
assert data["detail"].startswith("Crawl request failed:")
|
||||||
|
|
||||||
|
async def test_mixed_success_failure_urls(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test handling of mixed success/failure URLs."""
|
||||||
|
payload = {
|
||||||
|
"urls": [
|
||||||
|
SIMPLE_HTML_URL, # Should succeed
|
||||||
|
"https://nonexistent-domain-12345.com", # Should fail
|
||||||
|
"https://invalid-url-with-special-chars-!@#$%^&*()", # Should fail
|
||||||
|
],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"cache_mode": CacheMode.BYPASS.value,
|
||||||
|
"markdown_generator": {
|
||||||
|
"type": "DefaultMarkdownGenerator",
|
||||||
|
"params": {
|
||||||
|
"content_filter": {
|
||||||
|
"type": "PruningContentFilter",
|
||||||
|
"params": {"threshold": 0.5}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
assert response.status_code == 200
|
||||||
|
data = response.json()
|
||||||
|
assert data["success"] is True
|
||||||
|
assert len(data["results"]) == 3
|
||||||
|
|
||||||
|
success_count = 0
|
||||||
|
failure_count = 0
|
||||||
|
|
||||||
|
for result in data["results"]:
|
||||||
|
if result["success"]:
|
||||||
|
success_count += 1
|
||||||
|
else:
|
||||||
|
failure_count += 1
|
||||||
|
assert "error_message" in result
|
||||||
|
assert len(result["error_message"]) > 0
|
||||||
|
|
||||||
|
assert success_count >= 1 # At least one should succeed
|
||||||
|
assert failure_count >= 1 # At least one should fail
|
||||||
|
|
||||||
|
async def test_streaming_mixed_urls(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test streaming with mixed success/failure URLs."""
|
||||||
|
payload = {
|
||||||
|
"urls": [
|
||||||
|
SIMPLE_HTML_URL, # Should succeed
|
||||||
|
"https://nonexistent-domain-12345.com", # Should fail
|
||||||
|
],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"stream": True,
|
||||||
|
"cache_mode": CacheMode.BYPASS.value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
|
||||||
|
response.raise_for_status()
|
||||||
|
results = await process_streaming_response(response)
|
||||||
|
|
||||||
|
assert len(results) == 2
|
||||||
|
|
||||||
|
success_count = 0
|
||||||
|
failure_count = 0
|
||||||
|
|
||||||
|
for result in results:
|
||||||
|
if result["success"]:
|
||||||
|
success_count += 1
|
||||||
|
assert result["url"] == SIMPLE_HTML_URL
|
||||||
|
else:
|
||||||
|
failure_count += 1
|
||||||
|
assert "error_message" in result
|
||||||
|
assert result["error_message"] is not None
|
||||||
|
|
||||||
|
assert success_count == 1
|
||||||
|
assert failure_count == 1
|
||||||
|
|
||||||
|
async def test_markdown_endpoint_error_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test error handling for markdown endpoint."""
|
||||||
|
# Test invalid URL
|
||||||
|
invalid_payload = {"url": "invalid-url", "f": "fit"}
|
||||||
|
response = await async_client.post("/md", json=invalid_payload)
|
||||||
|
# Should return 400 for invalid URL format
|
||||||
|
assert response.status_code == 400
|
||||||
|
|
||||||
|
# Test non-existent URL
|
||||||
|
nonexistent_payload = {"url": "https://nonexistent-domain-12345.com", "f": "fit"}
|
||||||
|
response = await async_client.post("/md", json=nonexistent_payload)
|
||||||
|
# Should return 500 for crawl failure
|
||||||
|
assert response.status_code == 500
|
||||||
|
|
||||||
|
async def test_html_endpoint_error_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test error handling for HTML endpoint."""
|
||||||
|
# Test invalid URL
|
||||||
|
invalid_payload = {"url": "invalid-url"}
|
||||||
|
response = await async_client.post("/html", json=invalid_payload)
|
||||||
|
# Should return 500 for crawl failure
|
||||||
|
assert response.status_code == 500
|
||||||
|
|
||||||
|
async def test_screenshot_endpoint_error_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test error handling for screenshot endpoint."""
|
||||||
|
# Test invalid URL
|
||||||
|
invalid_payload = {"url": "invalid-url"}
|
||||||
|
response = await async_client.post("/screenshot", json=invalid_payload)
|
||||||
|
# Should return 500 for crawl failure
|
||||||
|
assert response.status_code == 500
|
||||||
|
|
||||||
|
async def test_pdf_endpoint_error_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test error handling for PDF endpoint."""
|
||||||
|
# Test invalid URL
|
||||||
|
invalid_payload = {"url": "invalid-url"}
|
||||||
|
response = await async_client.post("/pdf", json=invalid_payload)
|
||||||
|
# Should return 500 for crawl failure
|
||||||
|
assert response.status_code == 500
|
||||||
|
|
||||||
|
async def test_execute_js_endpoint_error_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test error handling for execute_js endpoint."""
|
||||||
|
# Test invalid URL
|
||||||
|
invalid_payload = {"url": "invalid-url", "scripts": ["return document.title;"]}
|
||||||
|
response = await async_client.post("/execute_js", json=invalid_payload)
|
||||||
|
# Should return 500 for crawl failure
|
||||||
|
assert response.status_code == 500
|
||||||
|
|
||||||
|
async def test_llm_endpoint_error_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test error handling for LLM endpoint."""
|
||||||
|
# Test missing query parameter
|
||||||
|
response = await async_client.get("/llm/https://example.com")
|
||||||
|
assert response.status_code == 422 # FastAPI validation error, not 400
|
||||||
|
|
||||||
|
# Test invalid URL
|
||||||
|
response = await async_client.get("/llm/invalid-url?q=test")
|
||||||
|
# Should return 500 for crawl failure
|
||||||
|
assert response.status_code == 500
|
||||||
|
|
||||||
|
async def test_ask_endpoint_error_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test error handling for ask endpoint."""
|
||||||
|
# Test invalid context_type
|
||||||
|
response = await async_client.get("/ask?context_type=invalid")
|
||||||
|
assert response.status_code == 422 # Validation error
|
||||||
|
|
||||||
|
# Test invalid score_ratio
|
||||||
|
response = await async_client.get("/ask?score_ratio=2.0") # > 1.0
|
||||||
|
assert response.status_code == 422 # Validation error
|
||||||
|
|
||||||
|
# Test invalid max_results
|
||||||
|
response = await async_client.get("/ask?max_results=0") # < 1
|
||||||
|
assert response.status_code == 422 # Validation error
|
||||||
|
|
||||||
|
async def test_config_dump_error_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test error handling for config dump endpoint."""
|
||||||
|
# Test invalid code
|
||||||
|
invalid_payload = {"code": "invalid_code"}
|
||||||
|
response = await async_client.post("/config/dump", json=invalid_payload)
|
||||||
|
assert response.status_code == 400
|
||||||
|
|
||||||
|
# Test nested function calls (not allowed)
|
||||||
|
nested_payload = {"code": "CrawlerRunConfig(BrowserConfig())"}
|
||||||
|
response = await async_client.post("/config/dump", json=nested_payload)
|
||||||
|
assert response.status_code == 400
|
||||||
|
|
||||||
|
async def test_malformed_request_handling(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test handling of malformed requests."""
|
||||||
|
# Test missing required fields
|
||||||
|
malformed_payload = {"urls": []} # Missing browser_config and crawler_config
|
||||||
|
response = await async_client.post("/crawl", json=malformed_payload)
|
||||||
|
print(f"Response: {response.text}")
|
||||||
|
assert response.status_code == 422 # Validation error
|
||||||
|
|
||||||
|
# Test empty URLs list
|
||||||
|
empty_urls_payload = {
|
||||||
|
"urls": [],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {}},
|
||||||
|
"crawler_config": {"type": "CrawlerRunConfig", "params": {}}
|
||||||
|
}
|
||||||
|
response = await async_client.post("/crawl", json=empty_urls_payload)
|
||||||
|
assert response.status_code == 422 # "At least one URL required"
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Define arguments for pytest programmatically
|
# Define arguments for pytest programmatically
|
||||||
# -v: verbose output
|
# -v: verbose output
|
||||||
|
|||||||
117
tests/general/test_bff_scoring.py
Normal file
117
tests/general/test_bff_scoring.py
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Simple test to verify BestFirstCrawlingStrategy fixes.
|
||||||
|
This test crawls a real website and shows that:
|
||||||
|
1. Higher-scoring pages are crawled first (priority queue fix)
|
||||||
|
2. Links are scored before truncation (link discovery fix)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
|
||||||
|
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||||
|
|
||||||
|
async def test_best_first_strategy():
|
||||||
|
"""Test BestFirstCrawlingStrategy with keyword scoring"""
|
||||||
|
|
||||||
|
print("=" * 70)
|
||||||
|
print("Testing BestFirstCrawlingStrategy with Real URL")
|
||||||
|
print("=" * 70)
|
||||||
|
print("\nThis test will:")
|
||||||
|
print("1. Crawl Python.org documentation")
|
||||||
|
print("2. Score pages based on keywords: 'tutorial', 'guide', 'reference'")
|
||||||
|
print("3. Show that higher-scoring pages are crawled first")
|
||||||
|
print("-" * 70)
|
||||||
|
|
||||||
|
# Create a keyword scorer that prioritizes tutorial/guide pages
|
||||||
|
scorer = KeywordRelevanceScorer(
|
||||||
|
keywords=["tutorial", "guide", "reference", "documentation"],
|
||||||
|
weight=1.0,
|
||||||
|
case_sensitive=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create the strategy with scoring
|
||||||
|
strategy = BestFirstCrawlingStrategy(
|
||||||
|
max_depth=2, # Crawl 2 levels deep
|
||||||
|
max_pages=10, # Limit to 10 pages total
|
||||||
|
url_scorer=scorer, # Use keyword scoring
|
||||||
|
include_external=False # Only internal links
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configure browser and crawler
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True, # Run in background
|
||||||
|
verbose=False # Reduce output noise
|
||||||
|
)
|
||||||
|
|
||||||
|
crawler_config = CrawlerRunConfig(
|
||||||
|
deep_crawl_strategy=strategy,
|
||||||
|
verbose=False
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\nStarting crawl of https://docs.python.org/3/")
|
||||||
|
print("Looking for pages with keywords: tutorial, guide, reference, documentation")
|
||||||
|
print("-" * 70)
|
||||||
|
|
||||||
|
crawled_urls = []
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
# Crawl and collect results
|
||||||
|
results = await crawler.arun(
|
||||||
|
url="https://docs.python.org/3/",
|
||||||
|
config=crawler_config
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process results
|
||||||
|
if isinstance(results, list):
|
||||||
|
for result in results:
|
||||||
|
score = result.metadata.get('score', 0) if result.metadata else 0
|
||||||
|
depth = result.metadata.get('depth', 0) if result.metadata else 0
|
||||||
|
crawled_urls.append({
|
||||||
|
'url': result.url,
|
||||||
|
'score': score,
|
||||||
|
'depth': depth,
|
||||||
|
'success': result.success
|
||||||
|
})
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("CRAWL RESULTS (in order of crawling)")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
for i, item in enumerate(crawled_urls, 1):
|
||||||
|
status = "✓" if item['success'] else "✗"
|
||||||
|
# Highlight high-scoring pages
|
||||||
|
if item['score'] > 0.5:
|
||||||
|
print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
|
||||||
|
print(f" ^ HIGH SCORE - Contains keywords!")
|
||||||
|
else:
|
||||||
|
print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("ANALYSIS")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
# Check if higher scores appear early in the crawl
|
||||||
|
scores = [item['score'] for item in crawled_urls[1:]] # Skip initial URL
|
||||||
|
high_score_indices = [i for i, s in enumerate(scores) if s > 0.3]
|
||||||
|
|
||||||
|
if high_score_indices and high_score_indices[0] < len(scores) / 2:
|
||||||
|
print("✅ SUCCESS: Higher-scoring pages (with keywords) were crawled early!")
|
||||||
|
print(" This confirms the priority queue fix is working.")
|
||||||
|
else:
|
||||||
|
print("⚠️ Check the crawl order above - higher scores should appear early")
|
||||||
|
|
||||||
|
# Show score distribution
|
||||||
|
print(f"\nScore Statistics:")
|
||||||
|
print(f" - Total pages crawled: {len(crawled_urls)}")
|
||||||
|
print(f" - Average score: {sum(item['score'] for item in crawled_urls) / len(crawled_urls):.2f}")
|
||||||
|
print(f" - Max score: {max(item['score'] for item in crawled_urls):.2f}")
|
||||||
|
print(f" - Pages with keywords: {sum(1 for item in crawled_urls if item['score'] > 0.3)}")
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("TEST COMPLETE")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print("\n🔍 BestFirstCrawlingStrategy Simple Test\n")
|
||||||
|
asyncio.run(test_best_first_strategy())
|
||||||
@@ -24,7 +24,7 @@ CASES = [
|
|||||||
# --- BrowserConfig variants ---
|
# --- BrowserConfig variants ---
|
||||||
"BrowserConfig()",
|
"BrowserConfig()",
|
||||||
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
|
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
|
||||||
"BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')",
|
"BrowserConfig(browser_mode='builtin', proxy_config={'server': 'http://1.2.3.4:8080'})",
|
||||||
]
|
]
|
||||||
|
|
||||||
for code in CASES:
|
for code in CASES:
|
||||||
|
|||||||
42
tests/proxy/test_proxy_deprecation.py
Normal file
42
tests/proxy/test_proxy_deprecation.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
import warnings
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, ProxyConfig
|
||||||
|
|
||||||
|
|
||||||
|
def test_browser_config_proxy_string_emits_deprecation_and_autoconverts():
|
||||||
|
warnings.simplefilter("always", DeprecationWarning)
|
||||||
|
|
||||||
|
proxy_str = "23.95.150.145:6114:username:password"
|
||||||
|
with warnings.catch_warnings(record=True) as caught:
|
||||||
|
cfg = BrowserConfig(proxy=proxy_str, headless=True)
|
||||||
|
|
||||||
|
dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
|
||||||
|
assert dep_warnings, "Expected DeprecationWarning when using BrowserConfig(proxy=...)"
|
||||||
|
|
||||||
|
assert cfg.proxy is None, "cfg.proxy should be None after auto-conversion"
|
||||||
|
assert isinstance(cfg.proxy_config, ProxyConfig), "cfg.proxy_config should be ProxyConfig instance"
|
||||||
|
assert cfg.proxy_config.username == "username"
|
||||||
|
assert cfg.proxy_config.password == "password"
|
||||||
|
assert cfg.proxy_config.server.startswith("http://")
|
||||||
|
assert cfg.proxy_config.server.endswith(":6114")
|
||||||
|
|
||||||
|
|
||||||
|
def test_browser_config_with_proxy_config_emits_no_deprecation():
|
||||||
|
warnings.simplefilter("always", DeprecationWarning)
|
||||||
|
|
||||||
|
with warnings.catch_warnings(record=True) as caught:
|
||||||
|
cfg = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
proxy_config={
|
||||||
|
"server": "http://127.0.0.1:8080",
|
||||||
|
"username": "u",
|
||||||
|
"password": "p",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
|
||||||
|
assert not dep_warnings, "Did not expect DeprecationWarning when using proxy_config"
|
||||||
|
assert cfg.proxy is None
|
||||||
|
assert isinstance(cfg.proxy_config, ProxyConfig)
|
||||||
175
tests/test_preserve_https_for_internal_links.py
Normal file
175
tests/test_preserve_https_for_internal_links.py
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Final test and demo for HTTPS preservation feature (Issue #1410)
|
||||||
|
|
||||||
|
This demonstrates how the preserve_https_for_internal_links flag
|
||||||
|
prevents HTTPS downgrade when servers redirect to HTTP.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
def demonstrate_issue():
|
||||||
|
"""Show the problem: HTTPS -> HTTP redirect causes HTTP links"""
|
||||||
|
|
||||||
|
print("=" * 60)
|
||||||
|
print("DEMONSTRATING THE ISSUE")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Simulate what happens during crawling
|
||||||
|
original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
|
||||||
|
redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/" # Server redirects to HTTP
|
||||||
|
|
||||||
|
# Extract a relative link
|
||||||
|
relative_link = "/author/Albert-Einstein"
|
||||||
|
|
||||||
|
# Standard URL joining uses the redirected (HTTP) base
|
||||||
|
resolved_url = urljoin(redirected_url, relative_link)
|
||||||
|
|
||||||
|
print(f"Original URL: {original_url}")
|
||||||
|
print(f"Redirected to: {redirected_url}")
|
||||||
|
print(f"Relative link: {relative_link}")
|
||||||
|
print(f"Resolved link: {resolved_url}")
|
||||||
|
print(f"\n❌ Problem: Link is now HTTP instead of HTTPS!")
|
||||||
|
|
||||||
|
return resolved_url
|
||||||
|
|
||||||
|
def demonstrate_solution():
|
||||||
|
"""Show the solution: preserve HTTPS for internal links"""
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("DEMONSTRATING THE SOLUTION")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Our normalize_url with HTTPS preservation
|
||||||
|
def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
|
||||||
|
"""Normalize URL with optional HTTPS preservation"""
|
||||||
|
|
||||||
|
# Standard resolution
|
||||||
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
|
||||||
|
# Only for same-domain links
|
||||||
|
if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc:
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
print(f" → Preserved HTTPS for {parsed_full.netloc}")
|
||||||
|
|
||||||
|
return full_url
|
||||||
|
|
||||||
|
# Same scenario as before
|
||||||
|
original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
|
||||||
|
redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
|
||||||
|
relative_link = "/author/Albert-Einstein"
|
||||||
|
|
||||||
|
# Without preservation (current behavior)
|
||||||
|
resolved_without = normalize_url_with_preservation(
|
||||||
|
relative_link, redirected_url,
|
||||||
|
preserve_https=False, original_scheme='https'
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\nWithout preservation:")
|
||||||
|
print(f" Result: {resolved_without}")
|
||||||
|
|
||||||
|
# With preservation (new feature)
|
||||||
|
resolved_with = normalize_url_with_preservation(
|
||||||
|
relative_link, redirected_url,
|
||||||
|
preserve_https=True, original_scheme='https'
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\nWith preservation (preserve_https_for_internal_links=True):")
|
||||||
|
print(f" Result: {resolved_with}")
|
||||||
|
print(f"\n✅ Solution: Internal link stays HTTPS!")
|
||||||
|
|
||||||
|
return resolved_with
|
||||||
|
|
||||||
|
def test_edge_cases():
|
||||||
|
"""Test important edge cases"""
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("EDGE CASES")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
def preserve_https(href, base_url, original_scheme):
|
||||||
|
"""Helper to test preservation logic"""
|
||||||
|
full_url = urljoin(base_url, href)
|
||||||
|
|
||||||
|
if original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Fixed: check for protocol-relative URLs
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
|
return full_url
|
||||||
|
|
||||||
|
test_cases = [
|
||||||
|
# (description, href, base_url, original_scheme, should_be_https)
|
||||||
|
("External link", "http://other.com/page", "http://example.com", "https", False),
|
||||||
|
("Already HTTPS", "/page", "https://example.com", "https", True),
|
||||||
|
("No original HTTPS", "/page", "http://example.com", "http", False),
|
||||||
|
("Subdomain", "/page", "http://sub.example.com", "https", True),
|
||||||
|
("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
|
||||||
|
]
|
||||||
|
|
||||||
|
for desc, href, base_url, orig_scheme, should_be_https in test_cases:
|
||||||
|
result = preserve_https(href, base_url, orig_scheme)
|
||||||
|
is_https = result.startswith('https://')
|
||||||
|
status = "✅" if is_https == should_be_https else "❌"
|
||||||
|
|
||||||
|
print(f"\n{status} {desc}:")
|
||||||
|
print(f" Input: {href} + {base_url}")
|
||||||
|
print(f" Result: {result}")
|
||||||
|
print(f" Expected HTTPS: {should_be_https}, Got: {is_https}")
|
||||||
|
|
||||||
|
def usage_example():
|
||||||
|
"""Show how to use the feature in crawl4ai"""
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("USAGE IN CRAWL4AI")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
print("""
|
||||||
|
To enable HTTPS preservation in your crawl4ai code:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
preserve_https_for_internal_links=True # Enable HTTPS preservation
|
||||||
|
)
|
||||||
|
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
config=config
|
||||||
|
)
|
||||||
|
|
||||||
|
# All internal links will maintain HTTPS even if
|
||||||
|
# the server redirects to HTTP
|
||||||
|
```
|
||||||
|
|
||||||
|
This is especially useful for:
|
||||||
|
- Sites that redirect HTTPS to HTTP but still support HTTPS
|
||||||
|
- Security-conscious crawling where you want to stay on HTTPS
|
||||||
|
- Avoiding mixed content issues in downstream processing
|
||||||
|
""")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Run all demonstrations
|
||||||
|
demonstrate_issue()
|
||||||
|
demonstrate_solution()
|
||||||
|
test_edge_cases()
|
||||||
|
usage_example()
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("✅ All tests complete!")
|
||||||
|
print("=" * 60)
|
||||||
849
tests/test_url_normalization_comprehensive.py
Normal file
849
tests/test_url_normalization_comprehensive.py
Normal file
@@ -0,0 +1,849 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Comprehensive test suite for URL normalization functions in utils.py
|
||||||
|
Tests all scenarios and edge cases for the updated normalize_url functions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import urljoin, urlparse, urlunparse, parse_qsl, urlencode
|
||||||
|
|
||||||
|
# Add the crawl4ai package to the path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
# Import only the specific functions we need to test
|
||||||
|
from crawl4ai.utils import get_base_domain, is_external_url
|
||||||
|
|
||||||
|
|
||||||
|
# ANSI Color codes for beautiful console output
|
||||||
|
class Colors:
|
||||||
|
# Basic colors
|
||||||
|
RED = '\033[91m'
|
||||||
|
GREEN = '\033[92m'
|
||||||
|
YELLOW = '\033[93m'
|
||||||
|
BLUE = '\033[94m'
|
||||||
|
MAGENTA = '\033[95m'
|
||||||
|
CYAN = '\033[96m'
|
||||||
|
WHITE = '\033[97m'
|
||||||
|
|
||||||
|
# Bright colors
|
||||||
|
BRIGHT_RED = '\033[91;1m'
|
||||||
|
BRIGHT_GREEN = '\033[92;1m'
|
||||||
|
BRIGHT_YELLOW = '\033[93;1m'
|
||||||
|
BRIGHT_BLUE = '\033[94;1m'
|
||||||
|
BRIGHT_MAGENTA = '\033[95;1m'
|
||||||
|
BRIGHT_CYAN = '\033[96;1m'
|
||||||
|
BRIGHT_WHITE = '\033[97;1m'
|
||||||
|
|
||||||
|
# Background colors
|
||||||
|
BG_RED = '\033[41m'
|
||||||
|
BG_GREEN = '\033[42m'
|
||||||
|
BG_YELLOW = '\033[43m'
|
||||||
|
BG_BLUE = '\033[44m'
|
||||||
|
|
||||||
|
# Text styles
|
||||||
|
BOLD = '\033[1m'
|
||||||
|
UNDERLINE = '\033[4m'
|
||||||
|
RESET = '\033[0m'
|
||||||
|
|
||||||
|
# Icons
|
||||||
|
CHECK = '✓'
|
||||||
|
CROSS = '✗'
|
||||||
|
WARNING = '⚠'
|
||||||
|
INFO = 'ℹ'
|
||||||
|
STAR = '⭐'
|
||||||
|
FIRE = '🔥'
|
||||||
|
ROCKET = '🚀'
|
||||||
|
TARGET = '🎯'
|
||||||
|
|
||||||
|
|
||||||
|
def colorize(text, color):
|
||||||
|
"""Apply color to text"""
|
||||||
|
return f"{color}{text}{Colors.RESET}"
|
||||||
|
|
||||||
|
|
||||||
|
def print_header(title, icon=""):
|
||||||
|
"""Print a formatted header"""
|
||||||
|
width = 80
|
||||||
|
print(f"\n{Colors.BG_BLUE}{Colors.WHITE}{Colors.BOLD}{'=' * width}{Colors.RESET}")
|
||||||
|
if icon:
|
||||||
|
print(f"{Colors.BG_BLUE}{Colors.WHITE}{Colors.BOLD}{' ' * ((width - len(title) - len(icon) - 1) // 2)}{icon} {title}{' ' * ((width - len(title) - len(icon) - 1) // 2)}{Colors.RESET}")
|
||||||
|
else:
|
||||||
|
print(f"{Colors.BG_BLUE}{Colors.WHITE}{Colors.BOLD}{' ' * ((width - len(title)) // 2)}{title}{' ' * ((width - len(title)) // 2)}{Colors.RESET}")
|
||||||
|
print(f"{Colors.BG_BLUE}{Colors.WHITE}{Colors.BOLD}{'=' * width}{Colors.RESET}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_section(title, icon=""):
|
||||||
|
"""Print a formatted section header"""
|
||||||
|
if icon:
|
||||||
|
print(f"\n{Colors.CYAN}{Colors.BOLD}{icon} {title}{Colors.RESET}")
|
||||||
|
else:
|
||||||
|
print(f"\n{Colors.CYAN}{Colors.BOLD}{title}{Colors.RESET}")
|
||||||
|
print(f"{Colors.CYAN}{'-' * (len(title) + (len(icon) + 1 if icon else 0))}{Colors.RESET}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_success(message):
|
||||||
|
"""Print success message"""
|
||||||
|
print(f"{Colors.GREEN}{Colors.CHECK} {message}{Colors.RESET}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_error(message):
|
||||||
|
"""Print error message"""
|
||||||
|
print(f"{Colors.RED}{Colors.CROSS} {message}{Colors.RESET}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_warning(message):
|
||||||
|
"""Print warning message"""
|
||||||
|
print(f"{Colors.YELLOW}{Colors.WARNING} {message}{Colors.RESET}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_info(message):
|
||||||
|
"""Print info message"""
|
||||||
|
print(f"{Colors.BLUE}{Colors.INFO} {message}{Colors.RESET}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_test_result(test_name, passed, expected=None, actual=None):
|
||||||
|
"""Print formatted test result"""
|
||||||
|
if passed:
|
||||||
|
print(f" {Colors.GREEN}{Colors.CHECK} {test_name}{Colors.RESET}")
|
||||||
|
else:
|
||||||
|
print(f" {Colors.RED}{Colors.CROSS} {test_name}{Colors.RESET}")
|
||||||
|
if expected is not None and actual is not None:
|
||||||
|
print(f" {Colors.BRIGHT_RED}Expected: {expected}{Colors.RESET}")
|
||||||
|
print(f" {Colors.BRIGHT_RED}Actual: {actual}{Colors.RESET}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_progress(current, total, test_name=""):
|
||||||
|
"""Print progress indicator"""
|
||||||
|
percentage = (current / total) * 100
|
||||||
|
bar_length = 40
|
||||||
|
filled_length = int(bar_length * current // total)
|
||||||
|
bar = '█' * filled_length + '░' * (bar_length - filled_length)
|
||||||
|
|
||||||
|
sys.stdout.write(f'\r{Colors.CYAN}Progress: [{bar}] {percentage:.1f}% ({current}/{total}) {test_name}{Colors.RESET}')
|
||||||
|
sys.stdout.flush()
|
||||||
|
|
||||||
|
if current == total:
|
||||||
|
print() # New line when complete
|
||||||
|
|
||||||
|
# Copy the normalize_url functions directly to avoid import issues
|
||||||
|
def normalize_url(
|
||||||
|
href: str,
|
||||||
|
base_url: str,
|
||||||
|
*,
|
||||||
|
drop_query_tracking=True,
|
||||||
|
sort_query=True,
|
||||||
|
keep_fragment=False,
|
||||||
|
extra_drop_params=None,
|
||||||
|
preserve_https=False,
|
||||||
|
original_scheme=None
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Extended URL normalizer with fixes for edge cases - copied from utils.py for testing
|
||||||
|
"""
|
||||||
|
if not href or not href.strip():
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Resolve relative paths first
|
||||||
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
|
# Parse once, edit parts, then rebuild
|
||||||
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
|
# ── netloc ──
|
||||||
|
netloc = parsed.netloc.lower()
|
||||||
|
|
||||||
|
# Remove default ports
|
||||||
|
if ':' in netloc:
|
||||||
|
host, port = netloc.rsplit(':', 1)
|
||||||
|
if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
|
||||||
|
netloc = host
|
||||||
|
else:
|
||||||
|
netloc = f"{host}:{port}"
|
||||||
|
|
||||||
|
# ── path ──
|
||||||
|
# Strip duplicate slashes and trailing "/" (except root)
|
||||||
|
# IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
|
||||||
|
# The path from urlparse is already properly encoded
|
||||||
|
path = parsed.path
|
||||||
|
if path.endswith('/') and path != '/':
|
||||||
|
path = path.rstrip('/')
|
||||||
|
|
||||||
|
# ── query ──
|
||||||
|
query = parsed.query
|
||||||
|
if query:
|
||||||
|
# explode, mutate, then rebuild
|
||||||
|
params = list(parse_qsl(query, keep_blank_values=True)) # Parse query string into key-value pairs, preserving blank values
|
||||||
|
|
||||||
|
if drop_query_tracking:
|
||||||
|
# Define default tracking parameters to remove for cleaner URLs
|
||||||
|
default_tracking = {
|
||||||
|
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term',
|
||||||
|
'utm_content', 'gclid', 'fbclid', 'ref', 'ref_src'
|
||||||
|
}
|
||||||
|
if extra_drop_params:
|
||||||
|
default_tracking |= {p.lower() for p in extra_drop_params} # Add any extra parameters to drop, case-insensitive
|
||||||
|
params = [(k, v) for k, v in params if k not in default_tracking] # Filter out tracking parameters
|
||||||
|
|
||||||
|
# Normalize parameter keys to lowercase
|
||||||
|
params = [(k.lower(), v) for k, v in params]
|
||||||
|
|
||||||
|
if sort_query:
|
||||||
|
params.sort(key=lambda kv: kv[0]) # Sort parameters alphabetically by key (now lowercase)
|
||||||
|
|
||||||
|
query = urlencode(params, doseq=True) if params else '' # Rebuild query string, handling sequences properly
|
||||||
|
|
||||||
|
# ── fragment ──
|
||||||
|
fragment = parsed.fragment if keep_fragment else ''
|
||||||
|
|
||||||
|
# Re-assemble
|
||||||
|
normalized = urlunparse((
|
||||||
|
parsed.scheme,
|
||||||
|
netloc,
|
||||||
|
path,
|
||||||
|
parsed.params,
|
||||||
|
query,
|
||||||
|
fragment
|
||||||
|
))
|
||||||
|
|
||||||
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||||
|
"""Normalize URLs for deep crawling - copied from utils.py for testing"""
|
||||||
|
if not href:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Use urljoin to handle relative URLs
|
||||||
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
|
# Parse the URL for normalization
|
||||||
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
|
# Convert hostname to lowercase
|
||||||
|
netloc = parsed.netloc.lower()
|
||||||
|
|
||||||
|
# Remove fragment entirely
|
||||||
|
fragment = ''
|
||||||
|
|
||||||
|
# Normalize query parameters if needed
|
||||||
|
query = parsed.query
|
||||||
|
if query:
|
||||||
|
# Parse query parameters
|
||||||
|
params = parse_qsl(query)
|
||||||
|
|
||||||
|
# Remove tracking parameters (example - customize as needed)
|
||||||
|
tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
|
||||||
|
params = [(k, v) for k, v in params if k not in tracking_params]
|
||||||
|
|
||||||
|
# Rebuild query string, sorted for consistency
|
||||||
|
query = urlencode(params, doseq=True) if params else ''
|
||||||
|
|
||||||
|
# Build normalized URL
|
||||||
|
normalized = urlunparse((
|
||||||
|
parsed.scheme,
|
||||||
|
netloc,
|
||||||
|
parsed.path.rstrip('/'), # Normalize trailing slash
|
||||||
|
parsed.params,
|
||||||
|
query,
|
||||||
|
fragment
|
||||||
|
))
|
||||||
|
|
||||||
|
return normalized
|
||||||
|
|
||||||
|
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||||
|
"""Efficient URL normalization with proper parsing - copied from utils.py for testing"""
|
||||||
|
if not href:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Resolve relative URLs
|
||||||
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
|
# Use proper URL parsing
|
||||||
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
|
# Only perform the most critical normalizations
|
||||||
|
# 1. Lowercase hostname
|
||||||
|
# 2. Remove fragment
|
||||||
|
normalized = urlunparse((
|
||||||
|
parsed.scheme,
|
||||||
|
parsed.netloc.lower(),
|
||||||
|
parsed.path.rstrip('/'),
|
||||||
|
parsed.params,
|
||||||
|
parsed.query,
|
||||||
|
'' # Remove fragment
|
||||||
|
))
|
||||||
|
|
||||||
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
|
class URLNormalizationTestSuite:
|
||||||
|
"""Comprehensive test suite for URL normalization functions"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.base_url = "https://example.com/path/page.html"
|
||||||
|
self.https_base_url = "https://example.com/path/page.html"
|
||||||
|
self.http_base_url = "http://example.com/path/page.html"
|
||||||
|
self.tests_run = 0
|
||||||
|
self.tests_passed = 0
|
||||||
|
self.tests_failed = []
|
||||||
|
self.test_start_time = None
|
||||||
|
self.section_stats = {}
|
||||||
|
self.current_section = None
|
||||||
|
|
||||||
|
def start_section(self, section_name, icon=""):
|
||||||
|
"""Start a new test section"""
|
||||||
|
self.current_section = section_name
|
||||||
|
if section_name not in self.section_stats:
|
||||||
|
self.section_stats[section_name] = {'run': 0, 'passed': 0, 'failed': 0}
|
||||||
|
print_section(section_name, icon)
|
||||||
|
|
||||||
|
def assert_equal(self, actual, expected, test_name):
|
||||||
|
"""Assert that actual equals expected"""
|
||||||
|
self.tests_run += 1
|
||||||
|
if self.current_section:
|
||||||
|
self.section_stats[self.current_section]['run'] += 1
|
||||||
|
|
||||||
|
if actual == expected:
|
||||||
|
self.tests_passed += 1
|
||||||
|
if self.current_section:
|
||||||
|
self.section_stats[self.current_section]['passed'] += 1
|
||||||
|
print_test_result(test_name, True)
|
||||||
|
else:
|
||||||
|
self.tests_failed.append({
|
||||||
|
'name': test_name,
|
||||||
|
'expected': expected,
|
||||||
|
'actual': actual,
|
||||||
|
'section': self.current_section
|
||||||
|
})
|
||||||
|
if self.current_section:
|
||||||
|
self.section_stats[self.current_section]['failed'] += 1
|
||||||
|
print_test_result(test_name, False, expected, actual)
|
||||||
|
|
||||||
|
def assert_none(self, actual, test_name):
|
||||||
|
"""Assert that actual is None"""
|
||||||
|
self.assert_equal(actual, None, test_name)
|
||||||
|
|
||||||
|
def test_basic_url_resolution(self):
|
||||||
|
"""Test basic relative and absolute URL resolution"""
|
||||||
|
self.start_section("Basic URL Resolution", Colors.TARGET)
|
||||||
|
|
||||||
|
# Absolute URLs should remain unchanged
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://other.com/page.html", self.base_url),
|
||||||
|
"https://other.com/page.html",
|
||||||
|
"Absolute URL unchanged"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Relative URLs
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("relative.html", self.base_url),
|
||||||
|
"https://example.com/path/relative.html",
|
||||||
|
"Relative URL resolution"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("./relative.html", self.base_url),
|
||||||
|
"https://example.com/path/relative.html",
|
||||||
|
"Relative URL with dot"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("../relative.html", self.base_url),
|
||||||
|
"https://example.com/relative.html",
|
||||||
|
"Parent directory resolution"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Root-relative URLs
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("/root.html", self.base_url),
|
||||||
|
"https://example.com/root.html",
|
||||||
|
"Root-relative URL"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Protocol-relative URLs
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("//cdn.example.com/asset.js", self.base_url),
|
||||||
|
"https://cdn.example.com/asset.js",
|
||||||
|
"Protocol-relative URL"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_query_parameter_handling(self):
|
||||||
|
"""Test query parameter sorting and tracking removal"""
|
||||||
|
self.start_section("Query Parameter Handling", Colors.STAR)
|
||||||
|
|
||||||
|
# Basic query parameters
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com?page=1&sort=name", self.base_url),
|
||||||
|
"https://example.com?page=1&sort=name",
|
||||||
|
"Basic query parameters sorted"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Tracking parameters removal
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com?utm_source=google&utm_medium=email&page=1", self.base_url),
|
||||||
|
"https://example.com?page=1",
|
||||||
|
"Tracking parameters removed"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mixed tracking and valid parameters
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com?fbclid=123&utm_campaign=test&category=news&id=456", self.base_url),
|
||||||
|
"https://example.com?category=news&id=456",
|
||||||
|
"Mixed tracking and valid parameters"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Empty query values
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com?page=&sort=name", self.base_url),
|
||||||
|
"https://example.com?page=&sort=name",
|
||||||
|
"Empty query values preserved"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Disable tracking removal
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com?utm_source=google&page=1", self.base_url, drop_query_tracking=False),
|
||||||
|
"https://example.com?page=1&utm_source=google",
|
||||||
|
"Tracking parameters preserved when disabled"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Disable sorting
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com?z=1&a=2", self.base_url, sort_query=False),
|
||||||
|
"https://example.com?z=1&a=2",
|
||||||
|
"Query parameters not sorted when disabled"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_fragment_handling(self):
|
||||||
|
"""Test fragment/hash handling"""
|
||||||
|
self.start_section("Fragment Handling", Colors.FIRE)
|
||||||
|
|
||||||
|
# Fragments removed by default
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com/page.html#section", self.base_url),
|
||||||
|
"https://example.com/page.html",
|
||||||
|
"Fragment removed by default"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Fragments preserved when requested
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com/page.html#section", self.base_url, keep_fragment=True),
|
||||||
|
"https://example.com/page.html#section",
|
||||||
|
"Fragment preserved when requested"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Fragments with query parameters
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com?page=1#section", self.base_url, keep_fragment=True),
|
||||||
|
"https://example.com?page=1#section",
|
||||||
|
"Fragment with query parameters"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_https_preservation(self):
|
||||||
|
"""Test HTTPS preservation logic"""
|
||||||
|
self.start_section("HTTPS Preservation", Colors.ROCKET)
|
||||||
|
|
||||||
|
# Same domain HTTP to HTTPS
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("http://example.com/page.html", self.https_base_url, preserve_https=True, original_scheme='https'),
|
||||||
|
"https://example.com/page.html",
|
||||||
|
"HTTP to HTTPS for same domain"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Different domain should not change
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("http://other.com/page.html", self.https_base_url, preserve_https=True, original_scheme='https'),
|
||||||
|
"http://other.com/page.html",
|
||||||
|
"Different domain HTTP unchanged"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Protocol-relative should follow base
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("//example.com/page.html", self.https_base_url, preserve_https=True, original_scheme='https'),
|
||||||
|
"https://example.com/page.html",
|
||||||
|
"Protocol-relative follows base scheme"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_edge_cases(self):
|
||||||
|
"""Test edge cases and error conditions"""
|
||||||
|
self.start_section("Edge Cases", Colors.WARNING)
|
||||||
|
|
||||||
|
# None and empty inputs
|
||||||
|
result = normalize_url(None, self.base_url) # type: ignore
|
||||||
|
self.assert_none(result, "None input")
|
||||||
|
|
||||||
|
self.assert_none(normalize_url("", self.base_url), "Empty string input")
|
||||||
|
self.assert_none(normalize_url(" ", self.base_url), "Whitespace only input")
|
||||||
|
|
||||||
|
# Malformed URLs
|
||||||
|
try:
|
||||||
|
normalize_url("not-a-url", "invalid-base")
|
||||||
|
print("✗ Should have raised ValueError for invalid base URL")
|
||||||
|
except ValueError:
|
||||||
|
print("✓ Correctly raised ValueError for invalid base URL")
|
||||||
|
|
||||||
|
# Special protocols
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("mailto:test@example.com", self.base_url),
|
||||||
|
"mailto:test@example.com",
|
||||||
|
"Mailto protocol preserved"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("tel:+1234567890", self.base_url),
|
||||||
|
"tel:+1234567890",
|
||||||
|
"Tel protocol preserved"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("javascript:void(0)", self.base_url),
|
||||||
|
"javascript:void(0)",
|
||||||
|
"JavaScript protocol preserved"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_case_sensitivity(self):
|
||||||
|
"""Test case sensitivity handling"""
|
||||||
|
self.start_section("Case Sensitivity", Colors.INFO)
|
||||||
|
|
||||||
|
# Domain case normalization
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://EXAMPLE.COM/page.html", self.base_url),
|
||||||
|
"https://example.com/page.html",
|
||||||
|
"Domain case normalization"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mixed case paths
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com/PATH/Page.HTML", self.base_url),
|
||||||
|
"https://example.com/PATH/Page.HTML",
|
||||||
|
"Path case preserved"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Query parameter case
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com?PARAM=value", self.base_url),
|
||||||
|
"https://example.com?param=value",
|
||||||
|
"Query parameter case normalization"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_unicode_and_special_chars(self):
|
||||||
|
"""Test Unicode and special characters"""
|
||||||
|
self.start_section("Unicode & Special Characters", "🌍")
|
||||||
|
|
||||||
|
# Unicode in path
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com/café.html", self.base_url),
|
||||||
|
"https://example.com/café.html",
|
||||||
|
"Unicode characters in path"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Encoded characters
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com/caf%C3%A9.html", self.base_url),
|
||||||
|
"https://example.com/caf%C3%A9.html",
|
||||||
|
"URL-encoded characters preserved"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Spaces in URLs
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com/page with spaces.html", self.base_url),
|
||||||
|
"https://example.com/page with spaces.html",
|
||||||
|
"Spaces in URLs handled"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_port_numbers(self):
|
||||||
|
"""Test port number handling"""
|
||||||
|
self.start_section("Port Numbers", "🔌")
|
||||||
|
|
||||||
|
# Default ports
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com:443/page.html", self.base_url),
|
||||||
|
"https://example.com/page.html",
|
||||||
|
"Default HTTPS port removed"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("http://example.com:80/page.html", self.base_url),
|
||||||
|
"http://example.com/page.html",
|
||||||
|
"Default HTTP port removed"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Non-default ports
|
||||||
|
self.assert_equal(
|
||||||
|
normalize_url("https://example.com:8443/page.html", self.base_url),
|
||||||
|
"https://example.com:8443/page.html",
|
||||||
|
"Non-default port preserved"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_trailing_slashes(self):
    """Verify trailing-slash normalization: stripped from paths, kept on the site root."""
    self.start_section("Trailing Slashes", "📁")

    # (raw URL, expected normalized URL, test label)
    cases = (
        ("https://example.com/path/",
         "https://example.com/path",
         "Trailing slash removed from path"),
        # The bare root "/" is semantically significant and must remain.
        ("https://example.com/",
         "https://example.com/",
         "Root trailing slash preserved"),
        ("https://example.com/path//",
         "https://example.com/path",
         "Multiple trailing slashes normalized"),
    )
    for raw, expected, label in cases:
        self.assert_equal(normalize_url(raw, self.base_url), expected, label)
def test_deep_crawl_functions(self):
    """Exercise the deep-crawl-specific normalizers (standard and efficient variants)."""
    self.start_section("Deep Crawl Functions", "🔍")

    # Standard variant: lowercases the host, drops tracking params
    # (utm_*) and the trailing slash, keeps meaningful query params.
    self.assert_equal(
        normalize_url_for_deep_crawl(
            "https://EXAMPLE.COM/path/?utm_source=test&page=1", self.base_url
        ),
        "https://example.com/path?page=1",
        "Deep crawl normalization",
    )

    # Efficient variant: same host/slash handling plus fragment removal.
    self.assert_equal(
        efficient_normalize_url_for_deep_crawl(
            "https://EXAMPLE.COM/path/#fragment", self.base_url
        ),
        "https://example.com/path",
        "Efficient deep crawl normalization",
    )
def test_base_domain_extraction(self):
    """Verify get_base_domain strips www, handles multi-part TLDs, and drops ports."""
    self.start_section("Base Domain Extraction", "🏠")

    # (input URL, expected base domain, test label)
    cases = (
        ("https://www.example.com/path", "example.com", "WWW prefix removed"),
        # co.uk-style suffixes must not be truncated to "co.uk".
        ("https://sub.example.co.uk/path", "example.co.uk", "Special TLD handled"),
        ("https://example.com:8080/path", "example.com", "Port removed"),
    )
    for url, expected, label in cases:
        self.assert_equal(get_base_domain(url), expected, label)
def test_external_url_detection(self):
    """Verify is_external_url classifies foreign domains, www-variants, and special schemes."""
    self.start_section("External URL Detection", "🌐")

    # (candidate URL, reference domain, expected verdict, test label)
    cases = (
        ("https://other.com/page.html", "example.com", True,
         "Different domain is external"),
        # www.<domain> is considered the same site.
        ("https://www.example.com/page.html", "example.com", False,
         "Same domain with www is internal"),
        # Non-http(s) schemes are always treated as external.
        ("mailto:test@example.com", "example.com", True,
         "Special protocol is external"),
    )
    for url, domain, expected, label in cases:
        self.assert_equal(is_external_url(url, domain), expected, label)
def run_all_tests(self):
    """Run every test section in order, print the summary, and return True when nothing failed."""
    print_header("🚀 URL Normalization Test Suite", Colors.ROCKET)
    self.test_start_time = time.time()

    # (section name, display icon, bound test method); the icon is kept
    # alongside the name for parity with the per-section headers.
    suite = [
        ("Basic URL Resolution", Colors.TARGET, self.test_basic_url_resolution),
        ("Query Parameter Handling", Colors.STAR, self.test_query_parameter_handling),
        ("Fragment Handling", Colors.FIRE, self.test_fragment_handling),
        ("HTTPS Preservation", Colors.ROCKET, self.test_https_preservation),
        ("Edge Cases", Colors.WARNING, self.test_edge_cases),
        ("Case Sensitivity", Colors.INFO, self.test_case_sensitivity),
        ("Unicode & Special Characters", "🌍", self.test_unicode_and_special_chars),
        ("Port Numbers", "🔌", self.test_port_numbers),
        ("Trailing Slashes", "📁", self.test_trailing_slashes),
        ("Deep Crawl Functions", "🔍", self.test_deep_crawl_functions),
        ("Base Domain Extraction", "🏠", self.test_base_domain_extraction),
        ("External URL Detection", "🌐", self.test_external_url_detection),
    ]

    total = len(suite)
    for idx, (section_name, _icon, run_section) in enumerate(suite, 1):
        # Progress is reported both before and after each section.
        print_progress(idx - 1, total, f"Running {section_name}")
        run_section()
        print_progress(idx, total, f"Completed {section_name}")

    # Comprehensive statistics, including wall-clock execution time.
    self.print_comprehensive_stats(time.time() - self.test_start_time)

    return len(self.tests_failed) == 0
def print_comprehensive_stats(self, execution_time: float) -> None:
    """Print the full results report: overall stats, a per-section
    breakdown, failed-test details, and recommendations.

    Args:
        execution_time: Wall-clock duration of the whole run, in seconds.
    """
    print_header("📊 Test Results Summary", "📈")

    # Overall statistics; guard against division by zero when no tests ran.
    success_rate = (self.tests_passed / self.tests_run * 100) if self.tests_run > 0 else 0

    print(f"{Colors.BOLD}Overall Statistics:{Colors.RESET}")
    print(f" Total Tests: {Colors.CYAN}{self.tests_run}{Colors.RESET}")
    print(f" Passed: {Colors.GREEN}{self.tests_passed}{Colors.RESET}")
    print(f" Failed: {Colors.RED}{len(self.tests_failed)}{Colors.RESET}")
    print(f" Success Rate: {Colors.BRIGHT_CYAN}{success_rate:.1f}%{Colors.RESET}")
    print(f" Execution Time: {Colors.YELLOW}{execution_time:.2f}s{Colors.RESET}")

    # Performance indicator: one-line verdict tiered by success rate.
    if success_rate == 100:
        print_success("🎉 Perfect! All tests passed!")
    elif success_rate >= 90:
        print_success("✅ Excellent! Nearly perfect results!")
    elif success_rate >= 75:
        print_warning("⚠️ Good results, but some improvements needed")
    else:
        print_error("❌ Significant issues detected - review failures below")

    # Section-by-section breakdown (only when sections were recorded).
    # Each stats entry holds 'run' / 'passed' / 'failed' counters.
    if self.section_stats:
        print(f"\n{Colors.BOLD}Section Breakdown:{Colors.RESET}")
        for section_name, stats in self.section_stats.items():
            section_success_rate = (stats['passed'] / stats['run'] * 100) if stats['run'] > 0 else 0
            status_icon = Colors.CHECK if stats['failed'] == 0 else Colors.CROSS
            status_color = Colors.GREEN if stats['failed'] == 0 else Colors.RED

            print(f" {status_icon} {section_name}: {Colors.CYAN}{stats['run']}{Colors.RESET} tests, "
                  f"{status_color}{stats['passed']} passed{Colors.RESET}, "
                  f"{Colors.RED}{stats['failed']} failed{Colors.RESET} "
                  f"({Colors.BRIGHT_CYAN}{section_success_rate:.1f}%{Colors.RESET})")

    # Failed tests details — each failure dict carries 'name',
    # 'expected', 'actual', and optionally a non-empty 'section'.
    if self.tests_failed:
        print(f"\n{Colors.BOLD}{Colors.RED}Failed Tests Details:{Colors.RESET}")
        for i, failure in enumerate(self.tests_failed, 1):
            print(f" {Colors.RED}{i}. {failure['name']}{Colors.RESET}")
            if 'section' in failure and failure['section']:
                print(f" Section: {Colors.YELLOW}{failure['section']}{Colors.RESET}")
            print(f" Expected: {Colors.BRIGHT_RED}{failure['expected']}{Colors.RESET}")
            print(f" Actual: {Colors.BRIGHT_RED}{failure['actual']}{Colors.RESET}")
            print()

    # Recommendations: actionable next steps, keyed off whether anything failed.
    if self.tests_failed:
        print(f"{Colors.BOLD}{Colors.YELLOW}Recommendations:{Colors.RESET}")
        print(f" • Review the {len(self.tests_failed)} failed test(s) above")
        print(" • Check URL normalization logic for edge cases")
        print(" • Verify query parameter handling")
        print(" • Test with real-world URLs")
    else:
        print(f"\n{Colors.BOLD}{Colors.GREEN}Recommendations:{Colors.RESET}")
        print(" • All tests passed! URL normalization is working correctly")
        print(" • Consider adding more edge cases for future robustness")
        print(" • Monitor performance with large-scale crawling")
def test_crawling_integration():
    """Smoke-test normalize_url against a sample of real-world crawl inputs,
    printing each result (or the raised error) rather than asserting."""
    print_section("Crawling Integration Test", "🔗")

    # Inputs a crawler actually encounters: tracking params, relative
    # paths, protocol-relative URLs, special schemes, and degenerate
    # values ("" and None) that should raise rather than crash the crawl.
    sample_urls = (
        "https://example.com/blog/post?utm_source=newsletter&utm_medium=email",
        "https://example.com/products?page=1&sort=price&ref=search",
        "/about.html",
        "../contact.html",
        "//cdn.example.com/js/main.js",
        "mailto:support@example.com",
        "#top",
        "",
        None,
    )

    page_url = "https://example.com/current/page.html"

    print("Testing real-world URL scenarios:")
    for candidate in sample_urls:
        try:
            resolved = normalize_url(candidate, page_url)
        except (ValueError, TypeError) as exc:
            print(f" {candidate} -> ERROR: {exc}")
        else:
            print(f" {candidate} -> {resolved}")
if __name__ == "__main__":
    # Script entry point: run the full suite, then the integration smoke
    # test, and exit with a status code suitable for CI.
    print_header("🧪 URL Normalization Comprehensive Test Suite", "🧪")
    print_info("Testing URL normalization functions with comprehensive scenarios and edge cases")
    print()

    # Run the main test suite; run_all_tests() returns True when every
    # assertion in every section passed.
    test_suite = URLNormalizationTestSuite()
    success = test_suite.run_all_tests()

    # Run integration tests (print-only smoke test; does not affect `success`).
    print()
    test_crawling_integration()

    # Final summary
    print()
    print_header("🏁 Final Test Summary", "🏁")

    if success:
        print_success("🎉 ALL TESTS PASSED! URL normalization is working perfectly!")
        print_info("The updated URL normalization functions are ready for production use.")
    else:
        print_error("❌ SOME TESTS FAILED! Please review the issues above.")
        print_warning("URL normalization may have issues that need to be addressed before deployment.")

    print()
    print_info("Test suite completed. Check the results above for detailed analysis.")

    # Exit with appropriate code: 0 on full success, 1 on any failure.
    sys.exit(0 if success else 1)
|
||||||
Reference in New Issue
Block a user