Compare commits
8 Commits
fix/releas
...
v0.7.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0163bd797c | ||
|
|
26bad799e4 | ||
|
|
cf8badfe27 | ||
|
|
ccbe3c105c | ||
|
|
761c19d54b | ||
|
|
14b0ecb137 | ||
|
|
0eaa9f9895 | ||
|
|
bde1bba6a2 |
@@ -3,7 +3,7 @@ import warnings
|
|||||||
|
|
||||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||||
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig
|
||||||
|
|
||||||
from .content_scraping_strategy import (
|
from .content_scraping_strategy import (
|
||||||
ContentScrapingStrategy,
|
ContentScrapingStrategy,
|
||||||
@@ -173,6 +173,7 @@ __all__ = [
|
|||||||
"CompilationResult",
|
"CompilationResult",
|
||||||
"ValidationResult",
|
"ValidationResult",
|
||||||
"ErrorDetail",
|
"ErrorDetail",
|
||||||
|
"LinkPreviewConfig"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# crawl4ai/__version__.py
|
# crawl4ai/__version__.py
|
||||||
|
|
||||||
# This is the version that will be used for stable releases
|
# This is the version that will be used for stable releases
|
||||||
__version__ = "0.7.0"
|
__version__ = "0.7.1"
|
||||||
|
|
||||||
# For nightly builds, this gets set during build process
|
# For nightly builds, this gets set during build process
|
||||||
__nightly_version__ = None
|
__nightly_version__ = None
|
||||||
|
|||||||
@@ -14,23 +14,8 @@ import hashlib
|
|||||||
from .js_snippet import load_js_script
|
from .js_snippet import load_js_script
|
||||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from playwright_stealth import StealthConfig
|
|
||||||
from .utils import get_chromium_path
|
from .utils import get_chromium_path
|
||||||
|
|
||||||
stealth_config = StealthConfig(
|
|
||||||
webdriver=True,
|
|
||||||
chrome_app=True,
|
|
||||||
chrome_csi=True,
|
|
||||||
chrome_load_times=True,
|
|
||||||
chrome_runtime=True,
|
|
||||||
navigator_languages=True,
|
|
||||||
navigator_plugins=True,
|
|
||||||
navigator_permissions=True,
|
|
||||||
webgl_vendor=True,
|
|
||||||
outerdimensions=True,
|
|
||||||
navigator_hardware_concurrency=True,
|
|
||||||
media_codecs=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
BROWSER_DISABLE_OPTIONS = [
|
BROWSER_DISABLE_OPTIONS = [
|
||||||
"--disable-background-networking",
|
"--disable-background-networking",
|
||||||
|
|||||||
@@ -1145,10 +1145,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
link_data["intrinsic_score"] = intrinsic_score
|
link_data["intrinsic_score"] = intrinsic_score
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fail gracefully - assign default score
|
# Fail gracefully - assign default score
|
||||||
link_data["intrinsic_score"] = float('inf')
|
link_data["intrinsic_score"] = 0
|
||||||
else:
|
else:
|
||||||
# No scoring enabled - assign infinity (all links equal priority)
|
# No scoring enabled - assign 0 (all links equal priority)
|
||||||
link_data["intrinsic_score"] = float('inf')
|
link_data["intrinsic_score"] = 0
|
||||||
|
|
||||||
is_external = is_external_url(normalized_href, base_domain)
|
is_external = is_external_url(normalized_href, base_domain)
|
||||||
if is_external:
|
if is_external:
|
||||||
|
|||||||
@@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||||
|
import asyncio
|
||||||
|
|
||||||
# Initialize with custom adaptive parameters
|
async def main():
|
||||||
config = AdaptiveConfig(
|
|
||||||
confidence_threshold=0.7, # Min confidence to stop crawling
|
|
||||||
max_depth=5, # Maximum crawl depth
|
|
||||||
max_pages=20, # Maximum number of pages to crawl
|
|
||||||
top_k_links=3, # Number of top links to follow per page
|
|
||||||
strategy="statistical", # 'statistical' or 'embedding'
|
|
||||||
coverage_weight=0.4, # Weight for coverage in confidence calculation
|
|
||||||
consistency_weight=0.3, # Weight for consistency in confidence calculation
|
|
||||||
saturation_weight=0.3 # Weight for saturation in confidence calculation
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize adaptive crawler with web crawler
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
|
||||||
|
|
||||||
# Crawl and learn patterns
|
# Configure adaptive crawler
|
||||||
state = await adaptive_crawler.digest(
|
config = AdaptiveConfig(
|
||||||
start_url="https://news.example.com/article/12345",
|
strategy="statistical", # or "embedding" for semantic understanding
|
||||||
query="latest news articles and content"
|
max_pages=10,
|
||||||
|
confidence_threshold=0.7, # Stop at 70% confidence
|
||||||
|
top_k_links=3, # Follow top 3 links per page
|
||||||
|
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||||
)
|
)
|
||||||
|
|
||||||
# Access results and confidence
|
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||||
print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
print(f"Pages Crawled: {len(state.crawled_urls)}")
|
|
||||||
print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
|
print("Starting adaptive crawl about Python decorators...")
|
||||||
|
result = await adaptive.digest(
|
||||||
|
start_url="https://docs.python.org/3/glossary.html",
|
||||||
|
query="python decorators functions wrapping"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\n✅ Crawling Complete!")
|
||||||
|
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||||
|
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||||
|
|
||||||
|
# Get most relevant content
|
||||||
|
relevant = adaptive.get_relevant_content(top_k=3)
|
||||||
|
print(f"\nMost Relevant Pages:")
|
||||||
|
for i, page in enumerate(relevant, 1):
|
||||||
|
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||||
|
|
||||||
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
@@ -141,56 +148,47 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
|
|
||||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||||
|
|
||||||
### The Three-Layer Scoring System
|
### Intelligent Link Analysis and Scoring
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
|
import asyncio
|
||||||
|
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||||
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
# Configure intelligent link analysis
|
async def main():
|
||||||
link_config = LinkPreviewConfig(
|
# Configure intelligent link analysis
|
||||||
include_internal=True,
|
link_config = LinkPreviewConfig(
|
||||||
include_external=False,
|
include_internal=True,
|
||||||
max_links=10,
|
include_external=False,
|
||||||
concurrency=5,
|
max_links=10,
|
||||||
query="python tutorial", # For contextual scoring
|
concurrency=5,
|
||||||
score_threshold=0.3,
|
query="python tutorial", # For contextual scoring
|
||||||
verbose=True
|
score_threshold=0.3,
|
||||||
)
|
verbose=True
|
||||||
|
|
||||||
# Use in your crawl
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://tech-blog.example.com",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
link_preview_config=link_config,
|
|
||||||
score_links=True, # Enable intrinsic scoring
|
|
||||||
cache_mode=CacheMode.BYPASS
|
|
||||||
)
|
)
|
||||||
)
|
# Use in your crawl
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://www.geeksforgeeks.org/",
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
link_preview_config=link_config,
|
||||||
|
score_links=True, # Enable intrinsic scoring
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# Access scored and sorted links
|
# Access scored links
|
||||||
if result.success and result.links:
|
if result.success and result.links:
|
||||||
# Get scored links
|
for link in result.links.get("internal", []):
|
||||||
internal_links = result.links.get("internal", [])
|
text = link.get('text', 'No text')[:40]
|
||||||
scored_links = [l for l in internal_links if l.get("total_score")]
|
print(
|
||||||
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
|
text,
|
||||||
|
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||||
|
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||||
|
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||||
|
)
|
||||||
|
|
||||||
# Create a scoring table
|
asyncio.run(main())
|
||||||
table = Table(title="Link Scoring Results", box=box.ROUNDED)
|
|
||||||
table.add_column("Link Text", style="cyan", width=40)
|
|
||||||
table.add_column("Intrinsic Score", justify="center")
|
|
||||||
table.add_column("Contextual Score", justify="center")
|
|
||||||
table.add_column("Total Score", justify="center", style="bold green")
|
|
||||||
|
|
||||||
for link in scored_links[:5]:
|
|
||||||
text = link.get('text', 'No text')[:40]
|
|
||||||
table.add_row(
|
|
||||||
text,
|
|
||||||
f"{link.get('intrinsic_score', 0):.1f}/10",
|
|
||||||
f"{link.get('contextual_score', 0):.2f}/1",
|
|
||||||
f"{link.get('total_score', 0):.3f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
console.print(table)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Scoring Components:**
|
**Scoring Components:**
|
||||||
@@ -223,58 +221,34 @@ console.print(table)
|
|||||||
### Technical Architecture
|
### Technical Architecture
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
import asyncio
|
||||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||||
|
|
||||||
# Basic discovery - find all product pages
|
async def main():
|
||||||
seeder_config = SeedingConfig(
|
async with AsyncUrlSeeder() as seeder:
|
||||||
# Discovery sources
|
# Discover Python tutorial URLs
|
||||||
source="cc+sitemap", # Sitemap + Common Crawl
|
config = SeedingConfig(
|
||||||
|
source="sitemap", # Use sitemap
|
||||||
# Filtering
|
pattern="*python*", # URL pattern filter
|
||||||
pattern="*/product/*", # URL pattern matching
|
extract_head=True, # Get metadata
|
||||||
|
query="python tutorial", # For relevance scoring
|
||||||
# Validation
|
scoring_method="bm25",
|
||||||
live_check=True, # Verify URLs are alive
|
score_threshold=0.2,
|
||||||
max_urls=50, # Stop at 50 URLs
|
max_urls=10
|
||||||
|
)
|
||||||
# Performance
|
|
||||||
concurrency=100, # Maximum concurrent requests for live checks/head extraction
|
print("Discovering Python async tutorial URLs...")
|
||||||
hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
|
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||||
)
|
|
||||||
|
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||||
|
for i, url_info in enumerate(urls[:5], 1):
|
||||||
|
print(f"\n{i}. {url_info['url']}")
|
||||||
|
if url_info.get('relevance_score'):
|
||||||
|
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||||
|
if url_info.get('head_data', {}).get('title'):
|
||||||
|
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||||
|
|
||||||
async with AsyncUrlSeeder() as seeder:
|
asyncio.run(main())
|
||||||
console.print("Discovering URLs from Python docs...")
|
|
||||||
urls = await seeder.urls("docs.python.org", seeding_config)
|
|
||||||
console.print(f"\n✓ Discovered {len(urls)} URLs")
|
|
||||||
|
|
||||||
# Advanced: Relevance-based discovery
|
|
||||||
research_config = SeedingConfig(
|
|
||||||
source="sitemap+cc", # Sitemap + Common Crawl
|
|
||||||
pattern="*/blog/*", # Blog posts only
|
|
||||||
|
|
||||||
# Content relevance
|
|
||||||
extract_head=True, # Get meta tags
|
|
||||||
query="quantum computing tutorials",
|
|
||||||
scoring_method="bm25", # BM25 scoring method
|
|
||||||
score_threshold=0.4, # High relevance only
|
|
||||||
|
|
||||||
# Smart filtering
|
|
||||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
|
||||||
|
|
||||||
force=True # Bypass cache
|
|
||||||
)
|
|
||||||
|
|
||||||
# Discover with progress tracking
|
|
||||||
discovered = []
|
|
||||||
async with AsyncUrlSeeder() as seeder:
|
|
||||||
discovered = await seeder.urls("https://physics-blog.com", research_config)
|
|
||||||
console.print(f"\n✓ Discovered {len(discovered)} URLs")
|
|
||||||
|
|
||||||
# Results include scores and metadata
|
|
||||||
for url_data in discovered[:5]:
|
|
||||||
print(f"URL: {url_data['url']}")
|
|
||||||
print(f"Score: {url_data['relevance_score']:.3f}")
|
|
||||||
print(f"Title: {url_data['head_data']['title']}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Discovery Methods:**
|
**Discovery Methods:**
|
||||||
|
|||||||
43
docs/blog/release-v0.7.1.md
Normal file
43
docs/blog/release-v0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||||
|
|
||||||
|
*July 17, 2025 • 2 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
A small maintenance release that removes unused code and improves documentation.
|
||||||
|
|
||||||
|
## 🎯 What's Changed
|
||||||
|
|
||||||
|
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||||
|
- **Updated documentation** with better examples and parameter explanations
|
||||||
|
- **Fixed virtual scroll configuration** examples in docs
|
||||||
|
|
||||||
|
## 🧹 Code Cleanup
|
||||||
|
|
||||||
|
Removed the `StealthConfig` import and its configuration object, neither of which was used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Removed unused code:
|
||||||
|
from playwright_stealth import StealthConfig
|
||||||
|
stealth_config = StealthConfig(...) # This was never used
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📖 Documentation Updates
|
||||||
|
|
||||||
|
- Fixed adaptive crawling parameter examples
|
||||||
|
- Updated session management documentation
|
||||||
|
- Corrected virtual scroll configuration examples
|
||||||
|
|
||||||
|
## 🚀 Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install crawl4ai==0.7.1
|
||||||
|
```
|
||||||
|
|
||||||
|
No breaking changes - upgrade directly from v0.7.0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Questions? Issues?
|
||||||
|
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||||
|
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||||
@@ -18,7 +18,7 @@ Usage:
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
|
|
||||||
async def basic_link_head_extraction():
|
async def basic_link_head_extraction():
|
||||||
|
|||||||
@@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||||
|
import asyncio
|
||||||
|
|
||||||
# Initialize with custom adaptive parameters
|
async def main():
|
||||||
config = AdaptiveConfig(
|
|
||||||
confidence_threshold=0.7, # Min confidence to stop crawling
|
|
||||||
max_depth=5, # Maximum crawl depth
|
|
||||||
max_pages=20, # Maximum number of pages to crawl
|
|
||||||
top_k_links=3, # Number of top links to follow per page
|
|
||||||
strategy="statistical", # 'statistical' or 'embedding'
|
|
||||||
coverage_weight=0.4, # Weight for coverage in confidence calculation
|
|
||||||
consistency_weight=0.3, # Weight for consistency in confidence calculation
|
|
||||||
saturation_weight=0.3 # Weight for saturation in confidence calculation
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize adaptive crawler with web crawler
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
|
||||||
|
|
||||||
# Crawl and learn patterns
|
# Configure adaptive crawler
|
||||||
state = await adaptive_crawler.digest(
|
config = AdaptiveConfig(
|
||||||
start_url="https://news.example.com/article/12345",
|
strategy="statistical", # or "embedding" for semantic understanding
|
||||||
query="latest news articles and content"
|
max_pages=10,
|
||||||
|
confidence_threshold=0.7, # Stop at 70% confidence
|
||||||
|
top_k_links=3, # Follow top 3 links per page
|
||||||
|
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||||
)
|
)
|
||||||
|
|
||||||
# Access results and confidence
|
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||||
print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
print(f"Pages Crawled: {len(state.crawled_urls)}")
|
|
||||||
print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
|
print("Starting adaptive crawl about Python decorators...")
|
||||||
|
result = await adaptive.digest(
|
||||||
|
start_url="https://docs.python.org/3/glossary.html",
|
||||||
|
query="python decorators functions wrapping"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\n✅ Crawling Complete!")
|
||||||
|
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||||
|
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||||
|
|
||||||
|
# Get most relevant content
|
||||||
|
relevant = adaptive.get_relevant_content(top_k=3)
|
||||||
|
print(f"\nMost Relevant Pages:")
|
||||||
|
for i, page in enumerate(relevant, 1):
|
||||||
|
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||||
|
|
||||||
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
@@ -141,56 +148,47 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
|
|
||||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||||
|
|
||||||
### The Three-Layer Scoring System
|
### Intelligent Link Analysis and Scoring
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
|
import asyncio
|
||||||
|
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||||
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
# Configure intelligent link analysis
|
async def main():
|
||||||
link_config = LinkPreviewConfig(
|
# Configure intelligent link analysis
|
||||||
include_internal=True,
|
link_config = LinkPreviewConfig(
|
||||||
include_external=False,
|
include_internal=True,
|
||||||
max_links=10,
|
include_external=False,
|
||||||
concurrency=5,
|
max_links=10,
|
||||||
query="python tutorial", # For contextual scoring
|
concurrency=5,
|
||||||
score_threshold=0.3,
|
query="python tutorial", # For contextual scoring
|
||||||
verbose=True
|
score_threshold=0.3,
|
||||||
)
|
verbose=True
|
||||||
|
|
||||||
# Use in your crawl
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://tech-blog.example.com",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
link_preview_config=link_config,
|
|
||||||
score_links=True, # Enable intrinsic scoring
|
|
||||||
cache_mode=CacheMode.BYPASS
|
|
||||||
)
|
)
|
||||||
)
|
# Use in your crawl
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://www.geeksforgeeks.org/",
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
link_preview_config=link_config,
|
||||||
|
score_links=True, # Enable intrinsic scoring
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# Access scored and sorted links
|
# Access scored links
|
||||||
if result.success and result.links:
|
if result.success and result.links:
|
||||||
# Get scored links
|
for link in result.links.get("internal", []):
|
||||||
internal_links = result.links.get("internal", [])
|
text = link.get('text', 'No text')[:40]
|
||||||
scored_links = [l for l in internal_links if l.get("total_score")]
|
print(
|
||||||
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
|
text,
|
||||||
|
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||||
|
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||||
|
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||||
|
)
|
||||||
|
|
||||||
# Create a scoring table
|
asyncio.run(main())
|
||||||
table = Table(title="Link Scoring Results", box=box.ROUNDED)
|
|
||||||
table.add_column("Link Text", style="cyan", width=40)
|
|
||||||
table.add_column("Intrinsic Score", justify="center")
|
|
||||||
table.add_column("Contextual Score", justify="center")
|
|
||||||
table.add_column("Total Score", justify="center", style="bold green")
|
|
||||||
|
|
||||||
for link in scored_links[:5]:
|
|
||||||
text = link.get('text', 'No text')[:40]
|
|
||||||
table.add_row(
|
|
||||||
text,
|
|
||||||
f"{link.get('intrinsic_score', 0):.1f}/10",
|
|
||||||
f"{link.get('contextual_score', 0):.2f}/1",
|
|
||||||
f"{link.get('total_score', 0):.3f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
console.print(table)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Scoring Components:**
|
**Scoring Components:**
|
||||||
@@ -223,58 +221,34 @@ console.print(table)
|
|||||||
### Technical Architecture
|
### Technical Architecture
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
import asyncio
|
||||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||||
|
|
||||||
# Basic discovery - find all product pages
|
async def main():
|
||||||
seeder_config = SeedingConfig(
|
async with AsyncUrlSeeder() as seeder:
|
||||||
# Discovery sources
|
# Discover Python tutorial URLs
|
||||||
source="cc+sitemap", # Sitemap + Common Crawl
|
config = SeedingConfig(
|
||||||
|
source="sitemap", # Use sitemap
|
||||||
# Filtering
|
pattern="*python*", # URL pattern filter
|
||||||
pattern="*/product/*", # URL pattern matching
|
extract_head=True, # Get metadata
|
||||||
|
query="python tutorial", # For relevance scoring
|
||||||
# Validation
|
scoring_method="bm25",
|
||||||
live_check=True, # Verify URLs are alive
|
score_threshold=0.2,
|
||||||
max_urls=50, # Stop at 50 URLs
|
max_urls=10
|
||||||
|
)
|
||||||
# Performance
|
|
||||||
concurrency=100, # Maximum concurrent requests for live checks/head extraction
|
print("Discovering Python async tutorial URLs...")
|
||||||
hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
|
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||||
)
|
|
||||||
|
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||||
|
for i, url_info in enumerate(urls[:5], 1):
|
||||||
|
print(f"\n{i}. {url_info['url']}")
|
||||||
|
if url_info.get('relevance_score'):
|
||||||
|
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||||
|
if url_info.get('head_data', {}).get('title'):
|
||||||
|
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||||
|
|
||||||
async with AsyncUrlSeeder() as seeder:
|
asyncio.run(main())
|
||||||
console.print("Discovering URLs from Python docs...")
|
|
||||||
urls = await seeder.urls("docs.python.org", seeding_config)
|
|
||||||
console.print(f"\n✓ Discovered {len(urls)} URLs")
|
|
||||||
|
|
||||||
# Advanced: Relevance-based discovery
|
|
||||||
research_config = SeedingConfig(
|
|
||||||
source="sitemap+cc", # Sitemap + Common Crawl
|
|
||||||
pattern="*/blog/*", # Blog posts only
|
|
||||||
|
|
||||||
# Content relevance
|
|
||||||
extract_head=True, # Get meta tags
|
|
||||||
query="quantum computing tutorials",
|
|
||||||
scoring_method="bm25", # BM25 scoring method
|
|
||||||
score_threshold=0.4, # High relevance only
|
|
||||||
|
|
||||||
# Smart filtering
|
|
||||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
|
||||||
|
|
||||||
force=True # Bypass cache
|
|
||||||
)
|
|
||||||
|
|
||||||
# Discover with progress tracking
|
|
||||||
discovered = []
|
|
||||||
async with AsyncUrlSeeder() as seeder:
|
|
||||||
discovered = await seeder.urls("https://physics-blog.com", research_config)
|
|
||||||
console.print(f"\n✓ Discovered {len(discovered)} URLs")
|
|
||||||
|
|
||||||
# Results include scores and metadata
|
|
||||||
for url_data in discovered[:5]:
|
|
||||||
print(f"URL: {url_data['url']}")
|
|
||||||
print(f"Score: {url_data['relevance_score']:.3f}")
|
|
||||||
print(f"Title: {url_data['head_data']['title']}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Discovery Methods:**
|
**Discovery Methods:**
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
async def extract_link_heads_example():
|
async def extract_link_heads_example():
|
||||||
"""
|
"""
|
||||||
@@ -237,7 +237,7 @@ if __name__ == "__main__":
|
|||||||
The `LinkPreviewConfig` class supports these options:
|
The `LinkPreviewConfig` class supports these options:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
link_preview_config = LinkPreviewConfig(
|
link_preview_config = LinkPreviewConfig(
|
||||||
# BASIC SETTINGS
|
# BASIC SETTINGS
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ from rich import box
|
|||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
||||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig
|
from crawl4ai import LinkPreviewConfig, VirtualScrollConfig
|
||||||
from crawl4ai import c4a_compile, CompilationResult
|
from crawl4ai import c4a_compile, CompilationResult
|
||||||
|
|
||||||
# Initialize Rich console for beautiful output
|
# Initialize Rich console for beautiful output
|
||||||
|
|||||||
@@ -13,14 +13,13 @@ from crawl4ai import (
|
|||||||
BrowserConfig,
|
BrowserConfig,
|
||||||
CacheMode,
|
CacheMode,
|
||||||
# New imports for v0.7.0
|
# New imports for v0.7.0
|
||||||
LinkPreviewConfig,
|
|
||||||
VirtualScrollConfig,
|
VirtualScrollConfig,
|
||||||
|
LinkPreviewConfig,
|
||||||
AdaptiveCrawler,
|
AdaptiveCrawler,
|
||||||
AdaptiveConfig,
|
AdaptiveConfig,
|
||||||
AsyncUrlSeeder,
|
AsyncUrlSeeder,
|
||||||
SeedingConfig,
|
SeedingConfig,
|
||||||
c4a_compile,
|
c4a_compile,
|
||||||
CompilationResult
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -170,16 +169,16 @@ async def demo_url_seeder():
|
|||||||
# Discover Python tutorial URLs
|
# Discover Python tutorial URLs
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="sitemap", # Use sitemap
|
source="sitemap", # Use sitemap
|
||||||
pattern="*tutorial*", # URL pattern filter
|
pattern="*python*", # URL pattern filter
|
||||||
extract_head=True, # Get metadata
|
extract_head=True, # Get metadata
|
||||||
query="python async programming", # For relevance scoring
|
query="python tutorial", # For relevance scoring
|
||||||
scoring_method="bm25",
|
scoring_method="bm25",
|
||||||
score_threshold=0.2,
|
score_threshold=0.2,
|
||||||
max_urls=10
|
max_urls=10
|
||||||
)
|
)
|
||||||
|
|
||||||
print("Discovering Python async tutorial URLs...")
|
print("Discovering Python async tutorial URLs...")
|
||||||
urls = await seeder.urls("docs.python.org", config)
|
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||||
|
|
||||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||||
for i, url_info in enumerate(urls[:5], 1):
|
for i, url_info in enumerate(urls[:5], 1):
|
||||||
@@ -245,39 +244,6 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
|
|||||||
print(f"❌ Compilation error: {result.first_error.message}")
|
print(f"❌ Compilation error: {result.first_error.message}")
|
||||||
|
|
||||||
|
|
||||||
async def demo_pdf_support():
|
|
||||||
"""
|
|
||||||
Demo 6: PDF Parsing Support
|
|
||||||
|
|
||||||
Shows how to extract content from PDF files.
|
|
||||||
Note: Requires 'pip install crawl4ai[pdf]'
|
|
||||||
"""
|
|
||||||
print("\n" + "="*60)
|
|
||||||
print("📄 DEMO 6: PDF Parsing Support")
|
|
||||||
print("="*60)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Check if PDF support is installed
|
|
||||||
import PyPDF2
|
|
||||||
|
|
||||||
# Example: Process a PDF URL
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
cache_mode=CacheMode.BYPASS,
|
|
||||||
pdf=True, # Enable PDF generation
|
|
||||||
extract_text_from_pdf=True # Extract text content
|
|
||||||
)
|
|
||||||
|
|
||||||
print("PDF parsing is available!")
|
|
||||||
print("You can now crawl PDF URLs and extract their content.")
|
|
||||||
print("\nExample usage:")
|
|
||||||
print(' result = await crawler.arun("https://example.com/document.pdf")')
|
|
||||||
print(' pdf_text = result.extracted_content # Contains extracted text')
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
print("⚠️ PDF support not installed.")
|
|
||||||
print("Install with: pip install crawl4ai[pdf]")
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
"""Run all demos"""
|
"""Run all demos"""
|
||||||
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
||||||
@@ -289,7 +255,6 @@ async def main():
|
|||||||
("Virtual Scroll", demo_virtual_scroll),
|
("Virtual Scroll", demo_virtual_scroll),
|
||||||
("URL Seeder", demo_url_seeder),
|
("URL Seeder", demo_url_seeder),
|
||||||
("C4A Script", demo_c4a_script),
|
("C4A Script", demo_c4a_script),
|
||||||
("PDF Support", demo_pdf_support)
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for name, demo_func in demos:
|
for name, demo_func in demos:
|
||||||
@@ -309,7 +274,6 @@ async def main():
|
|||||||
print("• Virtual Scroll: Capture all content from modern web pages")
|
print("• Virtual Scroll: Capture all content from modern web pages")
|
||||||
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
||||||
print("• C4A Script: Simple language for complex automations")
|
print("• C4A Script: Simple language for complex automations")
|
||||||
print("• PDF Support: Extract content from PDF documents")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
345
tests/docker/simple_api_test.py
Normal file
345
tests/docker/simple_api_test.py
Normal file
@@ -0,0 +1,345 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Simple API Test for Crawl4AI Docker Server v0.7.0
|
||||||
|
Uses only built-in Python modules to test all endpoints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import urllib.request
|
||||||
|
import urllib.parse
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
BASE_URL = "http://localhost:11234" # Change to your server URL
|
||||||
|
TEST_TIMEOUT = 30
|
||||||
|
|
||||||
|
class SimpleApiTester:
    """Smoke-test client for the Crawl4AI Docker server HTTP API.

    Each test issues one HTTP request with ``urllib`` and records a result
    dict with the keys ``endpoint``, ``method``, ``status`` ("PASS"/"FAIL"),
    ``status_code``, ``response_time`` and either ``data`` (parsed response)
    or ``error`` (exception text). Results passed to :meth:`print_result`
    are accumulated in ``self.results`` and summarised by
    :meth:`print_summary`.
    """

    def __init__(self, base_url: str = BASE_URL):
        """Create a tester that targets ``base_url``.

        Args:
            base_url: Scheme, host and port of the server; endpoints are
                appended to it verbatim.
        """
        self.base_url = base_url
        # Bearer token sent with every request when set; None disables auth.
        self.token = None
        # Result dicts, appended by print_result().
        self.results = []

    def log(self, message: str):
        """Print an informational message with an ``[INFO]`` prefix."""
        print(f"[INFO] {message}")

    def _request(self, method: str, endpoint: str, payload: Optional[Dict] = None) -> Dict:
        """Send one request and convert the outcome into a result dict.

        Args:
            method: "GET" or "POST". A POST sends ``payload`` as a JSON body.
            endpoint: Path (and optional query string) appended to the base URL.
            payload: JSON-serialisable body for POST requests; ignored for GET.

        Returns:
            A result dict as described in the class docstring. This method
            never raises for request failures; they are reported as "FAIL".
        """
        # Function-scope import keeps the module's import block untouched.
        from urllib.error import HTTPError

        url = f"{self.base_url}{endpoint}"
        start_time = time.time()

        try:
            if method == "POST":
                body = json.dumps(payload).encode('utf-8')
                req = urllib.request.Request(url, data=body, method='POST')
                req.add_header('Content-Type', 'application/json')
            else:
                req = urllib.request.Request(url)

            if self.token:
                req.add_header('Authorization', f'Bearer {self.token}')

            with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
                # Timing stops once headers are received, before the body is read.
                response_time = time.time() - start_time
                status_code = response.getcode()
                content = response.read().decode('utf-8')

            # Try to parse JSON; fall back to a short excerpt of the raw text.
            try:
                data = json.loads(content)
            except ValueError:  # json.JSONDecodeError is a ValueError subclass
                data = {"raw_response": content[:200]}

            return {
                "endpoint": endpoint,
                "method": method,
                "status": "PASS" if status_code < 400 else "FAIL",
                "status_code": status_code,
                "response_time": response_time,
                "data": data
            }
        except HTTPError as e:
            # urlopen raises for 4xx/5xx responses; keep the real status code
            # so the report shows e.g. 404 instead of "N/A".
            response_time = time.time() - start_time
            failure = {
                "endpoint": endpoint,
                "method": method,
                "status": "FAIL",
                "status_code": e.code,
                "response_time": response_time,
                "error": str(e)
            }
            e.close()  # release the underlying response/socket
            return failure
        except Exception as e:
            # Connection errors, timeouts, encoding problems, bad payloads...
            response_time = time.time() - start_time
            return {
                "endpoint": endpoint,
                "method": method,
                "status": "FAIL",
                "status_code": None,
                "response_time": response_time,
                "error": str(e)
            }

    def test_get_endpoint(self, endpoint: str) -> Dict:
        """Test a GET endpoint and return its result dict."""
        return self._request("GET", endpoint)

    def test_post_endpoint(self, endpoint: str, payload: Dict) -> Dict:
        """Test a POST endpoint with a JSON ``payload`` and return its result dict."""
        return self._request("POST", endpoint, payload)

    def print_result(self, result: Dict):
        """Print a formatted test result and record it in ``self.results``."""
        status_color = {
            "PASS": "✅",
            "FAIL": "❌",
            "SKIP": "⏭️"
        }

        print(f"{status_color[result['status']]} {result['method']} {result['endpoint']} "
              f"| {result['response_time']:.3f}s | Status: {result['status_code'] or 'N/A'}")

        if result['status'] == 'FAIL' and 'error' in result:
            print(f" Error: {result['error']}")

        self.results.append(result)

    def run_all_tests(self):
        """Run all API tests section by section, then print the summary."""
        print("🚀 Starting Crawl4AI v0.7.0 API Test Suite")
        print(f"📡 Testing server at: {self.base_url}")
        print("=" * 60)

        # NOTE: The basic-endpoint checks (GET /health, /schema, /metrics, /)
        # and the authentication step (POST /token with
        # {"email": "test@example.com"}, storing the returned access_token in
        # self.token) are currently disabled, so requests go out without a
        # bearer token.

        test_url = "https://example.com"

        # Each case is (method, endpoint, payload); payload is None for GET.
        sections = [
            ("CORE APIs", [
                # Markdown endpoint
                ("POST", "/md", {
                    "url": test_url,
                    "f": "fit",
                    "q": "test query",
                    "c": "0"
                }),
                # HTML endpoint
                ("POST", "/html", {"url": test_url}),
                # Screenshot endpoint
                ("POST", "/screenshot", {
                    "url": test_url,
                    "screenshot_wait_for": 2
                }),
                # PDF endpoint
                ("POST", "/pdf", {"url": test_url}),
                # JavaScript execution
                ("POST", "/execute_js", {
                    "url": test_url,
                    "scripts": ["(() => document.title)()"]
                }),
                # Crawl endpoint
                ("POST", "/crawl", {
                    "urls": [test_url],
                    "browser_config": {},
                    "crawler_config": {}
                }),
                # Config dump
                ("POST", "/config/dump", {"code": "CrawlerRunConfig()"}),
                # LLM endpoint
                ("GET", f"/llm/{test_url}?q=Extract%20main%20content", None),
                # Ask endpoint
                ("GET", "/ask?context_type=all&query=crawl4ai&max_results=5", None),
            ]),
            ("JOB APIs", [
                # LLM job
                ("POST", "/llm/job", {
                    "url": test_url,
                    "q": "Extract main content",
                    "cache": False
                }),
                # Crawl job
                ("POST", "/crawl/job", {
                    "urls": [test_url],
                    "browser_config": {},
                    "crawler_config": {}
                }),
            ]),
            ("MCP APIs", [
                # MCP schema
                ("GET", "/mcp/schema", None),
            ]),
            ("ERROR HANDLING", [
                # Invalid URL
                ("POST", "/md", {"url": "invalid-url", "f": "fit"}),
                # Invalid endpoint
                ("GET", "/nonexistent", None),
            ]),
        ]

        for title, cases in sections:
            print(f"\n=== {title} ===")
            for method, endpoint, payload in cases:
                if method == "POST":
                    result = self.test_post_endpoint(endpoint, payload)
                else:
                    result = self.test_get_endpoint(endpoint)
                self.print_result(result)

        # Print summary
        self.print_summary()

    def print_summary(self):
        """Print test results summary and write a JSON report to the working directory."""
        print("\n" + "=" * 60)
        print("📊 TEST RESULTS SUMMARY")
        print("=" * 60)

        total = len(self.results)
        passed = sum(1 for r in self.results if r['status'] == 'PASS')
        failed = sum(1 for r in self.results if r['status'] == 'FAIL')
        # Guard against an empty run so the summary never divides by zero.
        success_rate = (passed / total) * 100 if total else 0.0

        print(f"Total Tests: {total}")
        print(f"✅ Passed: {passed}")
        print(f"❌ Failed: {failed}")
        print(f"📈 Success Rate: {success_rate:.1f}%")

        if failed > 0:
            print("\n❌ FAILED TESTS:")
            for result in self.results:
                if result['status'] == 'FAIL':
                    print(f" • {result['method']} {result['endpoint']}")
                    if 'error' in result:
                        print(f" Error: {result['error']}")

        # Performance statistics
        response_times = [r['response_time'] for r in self.results if r['response_time'] > 0]
        if response_times:
            avg_time = sum(response_times) / len(response_times)
            max_time = max(response_times)
            print(f"\n⏱️ Average Response Time: {avg_time:.3f}s")
            print(f"⏱️ Max Response Time: {max_time:.3f}s")

        # Save detailed report; the timestamp in the name keeps runs separate.
        report_file = f"crawl4ai_test_report_{int(time.time())}.json"
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": time.time(),
                "server_url": self.base_url,
                "version": "0.7.0",
                "summary": {
                    "total": total,
                    "passed": passed,
                    "failed": failed
                },
                "results": self.results
            }, f, indent=2)

        print(f"\n📄 Detailed report saved to: {report_file}")
|
||||||
|
|
||||||
|
def main():
    """Main test runner.

    Reads the optional ``--url`` command-line argument (defaulting to
    BASE_URL), runs the full suite against that server, and exits with
    status 1 if the suite itself raises an unexpected exception. A
    keyboard interrupt is reported and does not change the exit status.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Crawl4AI v0.7.0 API Test Suite')
    cli.add_argument('--url', default=BASE_URL, help='Base URL of the server')
    options = cli.parse_args()

    suite = SimpleApiTester(options.url)

    try:
        suite.run_all_tests()
    except KeyboardInterrupt:
        print("\n🛑 Test suite interrupted by user")
    except Exception as exc:
        print(f"\n💥 Test suite failed with error: {exc}")
        sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
|
|||||||
|
|
||||||
from crawl4ai.models import Link
|
from crawl4ai.models import Link
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
import asyncio
|
import asyncio
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
@@ -237,7 +237,7 @@ def test_config_examples():
|
|||||||
print(f" {key}: {value}")
|
print(f" {key}: {value}")
|
||||||
|
|
||||||
print(" Usage:")
|
print(" Usage:")
|
||||||
print(" from crawl4ai.async_configs import LinkPreviewConfig")
|
print(" from crawl4ai import LinkPreviewConfig")
|
||||||
print(" config = CrawlerRunConfig(")
|
print(" config = CrawlerRunConfig(")
|
||||||
print(" link_preview_config=LinkPreviewConfig(")
|
print(" link_preview_config=LinkPreviewConfig(")
|
||||||
for key, value in config_dict.items():
|
for key, value in config_dict.items():
|
||||||
|
|||||||
Reference in New Issue
Block a user