diff --git a/README.md b/README.md index b74f386e..97a907a1 100644 --- a/README.md +++ b/README.md @@ -523,15 +523,18 @@ async def test_news_crawl(): - **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically: ```python config = AdaptiveConfig( - confidence_threshold=0.7, - max_history=100, - learning_rate=0.2 + confidence_threshold=0.7, # Min confidence to stop crawling + max_depth=5, # Maximum crawl depth + max_pages=20, # Maximum number of pages to crawl + strategy="statistical" ) - result = await crawler.arun( - "https://news.example.com", - config=CrawlerRunConfig(adaptive_config=config) - ) + async with AsyncWebCrawler() as crawler: + adaptive_crawler = AdaptiveCrawler(crawler, config) + state = await adaptive_crawler.digest( + start_url="https://news.example.com", + query="latest news content" + ) # Crawler learns patterns and improves extraction over time ``` diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 7a75e76d..be3cab0a 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ import warnings from .async_webcrawler import AsyncWebCrawler, CacheMode # MODIFIED: Add SeedingConfig and VirtualScrollConfig here -from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig from .content_scraping_strategy import ( ContentScrapingStrategy, @@ -173,6 +173,7 @@ __all__ = [ "CompilationResult", "ValidationResult", "ErrorDetail", + "LinkPreviewConfig" ] diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 3a436986..4a3ac419 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,7 +1,7 @@ # crawl4ai/__version__.py # This is the version that will be used for stable releases -__version__ = 
"0.7.0" +__version__ = "0.7.1" # For nightly builds, this gets set during build process __nightly_version__ = None diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 6ee43961..08c1f52f 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -14,23 +14,8 @@ import hashlib from .js_snippet import load_js_script from .config import DOWNLOAD_PAGE_TIMEOUT from .async_configs import BrowserConfig, CrawlerRunConfig -from playwright_stealth import StealthConfig from .utils import get_chromium_path -stealth_config = StealthConfig( - webdriver=True, - chrome_app=True, - chrome_csi=True, - chrome_load_times=True, - chrome_runtime=True, - navigator_languages=True, - navigator_plugins=True, - navigator_permissions=True, - webgl_vendor=True, - outerdimensions=True, - navigator_hardware_concurrency=True, - media_codecs=True, -) BROWSER_DISABLE_OPTIONS = [ "--disable-background-networking", diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index f1ea5fa5..3751d52f 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1145,10 +1145,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): link_data["intrinsic_score"] = intrinsic_score except Exception: # Fail gracefully - assign default score - link_data["intrinsic_score"] = float('inf') + link_data["intrinsic_score"] = 0 else: - # No scoring enabled - assign infinity (all links equal priority) - link_data["intrinsic_score"] = float('inf') + # No scoring enabled - assign a neutral score of 0 (all links equal priority) + link_data["intrinsic_score"] = 0 is_external = is_external_url(normalized_href, base_domain) if is_external: diff --git a/docs/blog/release-v0.7.0.md b/docs/blog/release-v0.7.0.md index 4ae9a689..1b474a99 100644 --- a/docs/blog/release-v0.7.0.md +++ b/docs/blog/release-v0.7.0.md @@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. 
This rel - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages -- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization +- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering -- **PDF Parsing**: Extract data from PDF documents - **Performance Optimizations**: Significant speed and memory improvements ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning @@ -30,44 +29,41 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking: - Extraction confidence scores ```python -from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState +from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig +import asyncio -# Initialize with custom learning parameters -config = AdaptiveConfig( - confidence_threshold=0.7, # Min confidence to use learned patterns - max_history=100, # Remember last 100 crawls per domain - learning_rate=0.2, # How quickly to adapt to changes - patterns_per_page=3, # Patterns to learn per page type - extraction_strategy='css' # 'css' or 'xpath' -) - -adaptive_crawler = AdaptiveCrawler(config) - -# First crawl - crawler learns the structure -async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - "https://news.example.com/article/12345", - config=CrawlerRunConfig( - adaptive_config=config, - extraction_hints={ # Optional hints to speed up learning - "title": "article h1", - "content": "article .body-content" - } - ) +async def main(): + + # Configure adaptive crawler + config = AdaptiveConfig( + strategy="statistical", # or "embedding" for semantic understanding + max_pages=10, + confidence_threshold=0.7, # Stop at 70% confidence + top_k_links=3, # Follow top 3 links per page + min_gain_threshold=0.05 # Need 5% information gain to 
continue ) - # Crawler identifies and stores patterns - if result.success: - state = adaptive_crawler.get_state("news.example.com") - print(f"Learned {len(state.patterns)} patterns") - print(f"Confidence: {state.avg_confidence:.2%}") + async with AsyncWebCrawler(verbose=False) as crawler: + adaptive = AdaptiveCrawler(crawler, config) + + print("Starting adaptive crawl about Python decorators...") + result = await adaptive.digest( + start_url="https://docs.python.org/3/glossary.html", + query="python decorators functions wrapping" + ) + + print(f"\n✅ Crawling Complete!") + print(f"• Confidence Level: {adaptive.confidence:.0%}") + print(f"• Pages Crawled: {len(result.crawled_urls)}") + print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents") + + # Get most relevant content + relevant = adaptive.get_relevant_content(top_k=3) + print(f"\nMost Relevant Pages:") + for i, page in enumerate(relevant, 1): + print(f"{i}. {page['url']} (relevance: {page['score']:.2%})") -# Subsequent crawls - uses learned patterns -result2 = await crawler.arun( - "https://news.example.com/article/67890", - config=CrawlerRunConfig(adaptive_config=config) -) -# Automatically extracts using learned patterns! 
+asyncio.run(main()) ``` **Expected Real-World Impact:** @@ -92,9 +88,7 @@ twitter_config = VirtualScrollConfig( container_selector="[data-testid='primaryColumn']", scroll_count=20, # Number of scrolls scroll_by="container_height", # Smart scrolling by container size - wait_after_scroll=1.0, # Let content load - capture_method="incremental", # Capture new content on each scroll - deduplicate=True # Remove duplicate elements + wait_after_scroll=1.0 # Let content load ) # For e-commerce product grids (Instagram style) @@ -102,8 +96,7 @@ grid_config = VirtualScrollConfig( container_selector="main .product-grid", scroll_count=30, scroll_by=800, # Fixed pixel scrolling - wait_after_scroll=1.5, # Images need time - stop_on_no_change=True # Smart stopping + wait_after_scroll=1.5 # Images need time ) # For news feeds with lazy loading @@ -111,9 +104,7 @@ news_config = VirtualScrollConfig( container_selector=".article-feed", scroll_count=50, scroll_by="page_height", # Viewport-based scrolling - wait_after_scroll=0.5, - wait_for_selector=".article-card", # Wait for specific elements - timeout=30000 # Max 30 seconds total + wait_after_scroll=0.5 # Wait for content to load ) # Use it in your crawl @@ -157,68 +148,63 @@ async with AsyncWebCrawler() as crawler: **My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals. 
-### The Three-Layer Scoring System +### Intelligent Link Analysis and Scoring ```python -from crawl4ai import LinkPreviewConfig +import asyncio +from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler +from crawl4ai import LinkPreviewConfig -# Configure intelligent link analysis -link_config = LinkPreviewConfig( - # What to analyze - include_internal=True, - include_external=True, - max_links=100, # Analyze top 100 links - - # Relevance scoring - query="machine learning tutorials", # Your interest - score_threshold=0.3, # Minimum relevance score - - # Performance - concurrent_requests=10, # Parallel processing - timeout_per_link=5000, # 5s per link - - # Advanced scoring weights - scoring_weights={ - "intrinsic": 0.3, # Link quality indicators - "contextual": 0.5, # Relevance to query - "popularity": 0.2 # Link prominence - } -) - -# Use in your crawl -result = await crawler.arun( - "https://tech-blog.example.com", - config=CrawlerRunConfig( - link_preview_config=link_config, - score_links=True +async def main(): + # Configure intelligent link analysis + link_config = LinkPreviewConfig( + include_internal=True, + include_external=False, + max_links=10, + concurrency=5, + query="python tutorial", # For contextual scoring + score_threshold=0.3, + verbose=True ) -) + # Use in your crawl + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://www.geeksforgeeks.org/", + config=CrawlerRunConfig( + link_preview_config=link_config, + score_links=True, # Enable intrinsic scoring + cache_mode=CacheMode.BYPASS + ) + ) -# Access scored and sorted links -for link in result.links["internal"][:10]: # Top 10 internal links - print(f"Score: {link['total_score']:.3f}") - print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes - print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query - print(f" URL: {link['href']}") - print(f" Title: {link['head_data']['title']}") - print(f" 
Description: {link['head_data']['meta']['description'][:100]}...") + # Access scored and sorted links + if result.success and result.links: + for link in result.links.get("internal", []): + text = link.get('text', 'No text')[:40] + print( + text, + f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10", + f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1", + f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000" + ) + +asyncio.run(main()) ``` **Scoring Components:** -1. **Intrinsic Score (0-10)**: Based on link quality indicators +1. **Intrinsic Score**: Based on link quality indicators - Position on page (navigation, content, footer) - Link attributes (rel, title, class names) - Anchor text quality and length - URL structure and depth -2. **Contextual Score (0-1)**: Relevance to your query - - Semantic similarity using embeddings +2. **Contextual Score**: Relevance to your query using BM25 algorithm - Keyword matching in link text and title - Meta description analysis - Content preview scoring -3. **Total Score**: Weighted combination for final ranking +3. 
**Total Score**: Combined score for final ranking **Expected Real-World Impact:** - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links @@ -235,58 +221,34 @@ for link in result.links["internal"][:10]: # Top 10 internal links ### Technical Architecture ```python +import asyncio from crawl4ai import AsyncUrlSeeder, SeedingConfig -# Basic discovery - find all product pages -seeder_config = SeedingConfig( - # Discovery sources - source="sitemap+cc", # Sitemap + Common Crawl - - # Filtering - pattern="*/product/*", # URL pattern matching - ignore_patterns=["*/reviews/*", "*/questions/*"], - - # Validation - live_check=True, # Verify URLs are alive - max_urls=5000, # Stop at 5000 URLs - - # Performance - concurrency=100, # Parallel requests - hits_per_sec=10 # Rate limiting -) +async def main(): + async with AsyncUrlSeeder() as seeder: + # Discover Python tutorial URLs + config = SeedingConfig( + source="sitemap", # Use sitemap + pattern="*python*", # URL pattern filter + extract_head=True, # Get metadata + query="python tutorial", # For relevance scoring + scoring_method="bm25", + score_threshold=0.2, + max_urls=10 + ) + + print("Discovering Python async tutorial URLs...") + urls = await seeder.urls("https://www.geeksforgeeks.org/", config) + + print(f"\n✅ Found {len(urls)} relevant URLs:") + for i, url_info in enumerate(urls[:5], 1): + print(f"\n{i}. 
{url_info['url']}") + if url_info.get('relevance_score'): + print(f" Relevance: {url_info['relevance_score']:.3f}") + if url_info.get('head_data', {}).get('title'): + print(f" Title: {url_info['head_data']['title'][:60]}...") -seeder = AsyncUrlSeeder(seeder_config) -urls = await seeder.discover("https://shop.example.com") - -# Advanced: Relevance-based discovery -research_config = SeedingConfig( - source="crawl+sitemap", # Deep crawl + sitemap - pattern="*/blog/*", # Blog posts only - - # Content relevance - extract_head=True, # Get meta tags - query="quantum computing tutorials", - scoring_method="bm25", # Or "semantic" (coming soon) - score_threshold=0.4, # High relevance only - - # Smart filtering - filter_nonsense_urls=True, # Remove .xml, .txt, etc. - min_content_length=500, # Skip thin content - - force=True # Bypass cache -) - -# Discover with progress tracking -discovered = [] -async for batch in seeder.discover_iter("https://physics-blog.com", research_config): - discovered.extend(batch) - print(f"Found {len(discovered)} relevant URLs so far...") - -# Results include scores and metadata -for url_data in discovered[:5]: - print(f"URL: {url_data['url']}") - print(f"Score: {url_data['score']:.3f}") - print(f"Title: {url_data['title']}") +asyncio.run(main()) ``` **Discovery Methods:** @@ -309,35 +271,18 @@ This release includes significant performance improvements through optimized res ### What We Optimized ```python -# Before v0.7.0 (slow) +# Optimized crawling with v0.7.0 improvements results = [] for url in urls: - result = await crawler.arun(url) - results.append(result) - -# After v0.7.0 (fast) -# Automatic batching and connection pooling -results = await crawler.arun_batch( - urls, - config=CrawlerRunConfig( - # New performance options - batch_size=10, # Process 10 URLs concurrently - reuse_browser=True, # Keep browser warm - eager_loading=False, # Load only what's needed - streaming_extraction=True, # Stream large extractions - - # Optimized defaults - 
wait_until="domcontentloaded", # Faster than networkidle - exclude_external_resources=True, # Skip third-party assets - block_ads=True # Ad blocking built-in + result = await crawler.arun( + url, + config=CrawlerRunConfig( + # Performance optimizations + wait_until="domcontentloaded", # Faster than networkidle + cache_mode=CacheMode.ENABLED # Enable caching + ) ) -) - -# Memory-efficient streaming for large crawls -async for result in crawler.arun_stream(large_url_list): - # Process results as they complete - await process_result(result) - # Memory is freed after each iteration + results.append(result) ``` **Performance Gains:** @@ -347,24 +292,6 @@ async for result in crawler.arun_stream(large_url_list): - **Memory Usage**: 60% reduction with streaming processing - **Concurrent Crawls**: Handle 5x more parallel requests -## 📄 PDF Support - -PDF extraction is now natively supported in Crawl4AI. - -```python -# Extract data from PDF documents -result = await crawler.arun( - "https://example.com/report.pdf", - config=CrawlerRunConfig( - pdf_extraction=True, - extraction_strategy=JsonCssExtractionStrategy({ - # Works on converted PDF structure - "title": {"selector": "h1", "type": "text"}, - "sections": {"selector": "h2", "type": "list"} - }) - ) -) -``` ## 🔧 Important Changes diff --git a/docs/blog/release-v0.7.1.md b/docs/blog/release-v0.7.1.md new file mode 100644 index 00000000..d5bfdaec --- /dev/null +++ b/docs/blog/release-v0.7.1.md @@ -0,0 +1,43 @@ +# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update + +*July 17, 2025 • 2 min read* + +--- + +A small maintenance release that removes unused code and improves documentation. 
+ +## 🎯 What's Changed + +- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py` +- **Updated documentation** with better examples and parameter explanations +- **Fixed virtual scroll configuration** examples in docs + +## 🧹 Code Cleanup + +Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead. + +```python +# Removed unused code: +from playwright_stealth import StealthConfig +stealth_config = StealthConfig(...) # This was never used +``` + +## 📖 Documentation Updates + +- Fixed adaptive crawling parameter examples +- Updated session management documentation +- Corrected virtual scroll configuration examples + +## 🚀 Installation + +```bash +pip install crawl4ai==0.7.1 +``` + +No breaking changes - upgrade directly from v0.7.0. + +--- + +Questions? Issues? +- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai) +- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN) \ No newline at end of file diff --git a/docs/examples/link_head_extraction_example.py b/docs/examples/link_head_extraction_example.py index ef146d95..500566ab 100644 --- a/docs/examples/link_head_extraction_example.py +++ b/docs/examples/link_head_extraction_example.py @@ -18,7 +18,7 @@ Usage: import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig async def basic_link_head_extraction(): diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index d63b1e80..1007a605 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy from crawl4ai.cache_context import CacheMode async def crawl_dynamic_content(): - async with AsyncWebCrawler() as crawler: - 
session_id = "github_commits_session" - url = "https://github.com/microsoft/TypeScript/commits/main" - all_commits = [] + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "wait_for_session" + all_commits = [] - # Define extraction schema - schema = { - "name": "Commit Extractor", - "baseSelector": "li.Box-sc-g0xbh4-0", - "fields": [{ - "name": "title", "selector": "h4.markdown-title", "type": "text" - }], - } - extraction_strategy = JsonCssExtractionStrategy(schema) + js_next_page = """ + const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4'); + if (commits.length > 0) { + window.lastCommit = commits[0].textContent.trim(); + } + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) {button.click(); console.log('button clicked') } + """ - # JavaScript and wait configurations - js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();""" - wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0""" - - # Crawl multiple pages + wait_for = """() => { + const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4'); + if (commits.length === 0) return false; + const firstCommit = commits[0].textContent.trim(); + return firstCommit !== window.lastCommit; + }""" + + schema = { + "name": "Commit Extractor", + "baseSelector": "li[data-testid='commit-row-item']", + "fields": [ + { + "name": "title", + "selector": "h4 a", + "type": "text", + "transform": "strip", + }, + ], + } + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + + browser_config = BrowserConfig( + verbose=True, + headless=False, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: for page in range(3): - config = CrawlerRunConfig( - url=url, + crawler_config = CrawlerRunConfig( session_id=session_id, + css_selector="li[data-testid='commit-row-item']", extraction_strategy=extraction_strategy, 
js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - cache_mode=CacheMode.BYPASS + cache_mode=CacheMode.BYPASS, + capture_console_messages=True, ) - - result = await crawler.arun(config=config) - if result.success: + + result = await crawler.arun(url=url, config=crawler_config) + + if result.console_messages: + print(f"Page {page + 1} console messages:", result.console_messages) + + if result.extracted_content: + # print(f"Page {page + 1} result:", result.extracted_content) commits = json.loads(result.extracted_content) all_commits.extend(commits) print(f"Page {page + 1}: Found {len(commits)} commits") + else: + print(f"Page {page + 1}: No content extracted") + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") # Clean up session await crawler.crawler_strategy.kill_session(session_id) - return all_commits ``` --- diff --git a/docs/md_v2/advanced/virtual-scroll.md b/docs/md_v2/advanced/virtual-scroll.md index 0b1a8f88..271a9564 100644 --- a/docs/md_v2/advanced/virtual-scroll.md +++ b/docs/md_v2/advanced/virtual-scroll.md @@ -91,13 +91,12 @@ async def crawl_twitter_timeline(): wait_after_scroll=1.0 # Twitter needs time to load ) + browser_config = BrowserConfig(headless=True) # Set to False to watch it work config = CrawlerRunConfig( - virtual_scroll_config=virtual_config, - # Optional: Set headless=False to watch it work - # browser_config=BrowserConfig(headless=False) + virtual_scroll_config=virtual_config ) - async with AsyncWebCrawler() as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url="https://twitter.com/search?q=AI", config=config @@ -200,7 +199,7 @@ Use **scan_full_page** when: Virtual Scroll works seamlessly with extraction strategies: ```python -from crawl4ai import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy, LLMConfig # Define extraction schema schema = { @@ -222,7 +221,7 @@ config = CrawlerRunConfig( 
scroll_count=20 ), extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o-mini", + llm_config=LLMConfig(provider="openai/gpt-4o-mini"), schema=schema ) ) diff --git a/docs/md_v2/blog/releases/0.7.0.md b/docs/md_v2/blog/releases/0.7.0.md index 4ae9a689..1b474a99 100644 --- a/docs/md_v2/blog/releases/0.7.0.md +++ b/docs/md_v2/blog/releases/0.7.0.md @@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages -- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization +- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering -- **PDF Parsing**: Extract data from PDF documents - **Performance Optimizations**: Significant speed and memory improvements ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning @@ -30,44 +29,41 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking: - Extraction confidence scores ```python -from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState +from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig +import asyncio -# Initialize with custom learning parameters -config = AdaptiveConfig( - confidence_threshold=0.7, # Min confidence to use learned patterns - max_history=100, # Remember last 100 crawls per domain - learning_rate=0.2, # How quickly to adapt to changes - patterns_per_page=3, # Patterns to learn per page type - extraction_strategy='css' # 'css' or 'xpath' -) - -adaptive_crawler = AdaptiveCrawler(config) - -# First crawl - crawler learns the structure -async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - "https://news.example.com/article/12345", - config=CrawlerRunConfig( - adaptive_config=config, - 
extraction_hints={ # Optional hints to speed up learning - "title": "article h1", - "content": "article .body-content" - } - ) +async def main(): + + # Configure adaptive crawler + config = AdaptiveConfig( + strategy="statistical", # or "embedding" for semantic understanding + max_pages=10, + confidence_threshold=0.7, # Stop at 70% confidence + top_k_links=3, # Follow top 3 links per page + min_gain_threshold=0.05 # Need 5% information gain to continue ) - # Crawler identifies and stores patterns - if result.success: - state = adaptive_crawler.get_state("news.example.com") - print(f"Learned {len(state.patterns)} patterns") - print(f"Confidence: {state.avg_confidence:.2%}") + async with AsyncWebCrawler(verbose=False) as crawler: + adaptive = AdaptiveCrawler(crawler, config) + + print("Starting adaptive crawl about Python decorators...") + result = await adaptive.digest( + start_url="https://docs.python.org/3/glossary.html", + query="python decorators functions wrapping" + ) + + print(f"\n✅ Crawling Complete!") + print(f"• Confidence Level: {adaptive.confidence:.0%}") + print(f"• Pages Crawled: {len(result.crawled_urls)}") + print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents") + + # Get most relevant content + relevant = adaptive.get_relevant_content(top_k=3) + print(f"\nMost Relevant Pages:") + for i, page in enumerate(relevant, 1): + print(f"{i}. {page['url']} (relevance: {page['score']:.2%})") -# Subsequent crawls - uses learned patterns -result2 = await crawler.arun( - "https://news.example.com/article/67890", - config=CrawlerRunConfig(adaptive_config=config) -) -# Automatically extracts using learned patterns! 
+asyncio.run(main()) ``` **Expected Real-World Impact:** @@ -92,9 +88,7 @@ twitter_config = VirtualScrollConfig( container_selector="[data-testid='primaryColumn']", scroll_count=20, # Number of scrolls scroll_by="container_height", # Smart scrolling by container size - wait_after_scroll=1.0, # Let content load - capture_method="incremental", # Capture new content on each scroll - deduplicate=True # Remove duplicate elements + wait_after_scroll=1.0 # Let content load ) # For e-commerce product grids (Instagram style) @@ -102,8 +96,7 @@ grid_config = VirtualScrollConfig( container_selector="main .product-grid", scroll_count=30, scroll_by=800, # Fixed pixel scrolling - wait_after_scroll=1.5, # Images need time - stop_on_no_change=True # Smart stopping + wait_after_scroll=1.5 # Images need time ) # For news feeds with lazy loading @@ -111,9 +104,7 @@ news_config = VirtualScrollConfig( container_selector=".article-feed", scroll_count=50, scroll_by="page_height", # Viewport-based scrolling - wait_after_scroll=0.5, - wait_for_selector=".article-card", # Wait for specific elements - timeout=30000 # Max 30 seconds total + wait_after_scroll=0.5 # Wait for content to load ) # Use it in your crawl @@ -157,68 +148,63 @@ async with AsyncWebCrawler() as crawler: **My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals. 
-### The Three-Layer Scoring System +### Intelligent Link Analysis and Scoring ```python -from crawl4ai import LinkPreviewConfig +import asyncio +from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler +from crawl4ai import LinkPreviewConfig -# Configure intelligent link analysis -link_config = LinkPreviewConfig( - # What to analyze - include_internal=True, - include_external=True, - max_links=100, # Analyze top 100 links - - # Relevance scoring - query="machine learning tutorials", # Your interest - score_threshold=0.3, # Minimum relevance score - - # Performance - concurrent_requests=10, # Parallel processing - timeout_per_link=5000, # 5s per link - - # Advanced scoring weights - scoring_weights={ - "intrinsic": 0.3, # Link quality indicators - "contextual": 0.5, # Relevance to query - "popularity": 0.2 # Link prominence - } -) - -# Use in your crawl -result = await crawler.arun( - "https://tech-blog.example.com", - config=CrawlerRunConfig( - link_preview_config=link_config, - score_links=True +async def main(): + # Configure intelligent link analysis + link_config = LinkPreviewConfig( + include_internal=True, + include_external=False, + max_links=10, + concurrency=5, + query="python tutorial", # For contextual scoring + score_threshold=0.3, + verbose=True ) -) + # Use in your crawl + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://www.geeksforgeeks.org/", + config=CrawlerRunConfig( + link_preview_config=link_config, + score_links=True, # Enable intrinsic scoring + cache_mode=CacheMode.BYPASS + ) + ) -# Access scored and sorted links -for link in result.links["internal"][:10]: # Top 10 internal links - print(f"Score: {link['total_score']:.3f}") - print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes - print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query - print(f" URL: {link['href']}") - print(f" Title: {link['head_data']['title']}") - print(f" 
Description: {link['head_data']['meta']['description'][:100]}...") + # Access scored and sorted links + if result.success and result.links: + for link in result.links.get("internal", []): + text = link.get('text', 'No text')[:40] + print( + text, + f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10", + f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1", + f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000" + ) + +asyncio.run(main()) ``` **Scoring Components:** -1. **Intrinsic Score (0-10)**: Based on link quality indicators +1. **Intrinsic Score**: Based on link quality indicators - Position on page (navigation, content, footer) - Link attributes (rel, title, class names) - Anchor text quality and length - URL structure and depth -2. **Contextual Score (0-1)**: Relevance to your query - - Semantic similarity using embeddings +2. **Contextual Score**: Relevance to your query using BM25 algorithm - Keyword matching in link text and title - Meta description analysis - Content preview scoring -3. **Total Score**: Weighted combination for final ranking +3. 
**Total Score**: Combined score for final ranking **Expected Real-World Impact:** - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links @@ -235,58 +221,34 @@ for link in result.links["internal"][:10]: # Top 10 internal links ### Technical Architecture ```python +import asyncio from crawl4ai import AsyncUrlSeeder, SeedingConfig -# Basic discovery - find all product pages -seeder_config = SeedingConfig( - # Discovery sources - source="sitemap+cc", # Sitemap + Common Crawl - - # Filtering - pattern="*/product/*", # URL pattern matching - ignore_patterns=["*/reviews/*", "*/questions/*"], - - # Validation - live_check=True, # Verify URLs are alive - max_urls=5000, # Stop at 5000 URLs - - # Performance - concurrency=100, # Parallel requests - hits_per_sec=10 # Rate limiting -) +async def main(): + async with AsyncUrlSeeder() as seeder: + # Discover Python tutorial URLs + config = SeedingConfig( + source="sitemap", # Use sitemap + pattern="*python*", # URL pattern filter + extract_head=True, # Get metadata + query="python tutorial", # For relevance scoring + scoring_method="bm25", + score_threshold=0.2, + max_urls=10 + ) + + print("Discovering Python async tutorial URLs...") + urls = await seeder.urls("https://www.geeksforgeeks.org/", config) + + print(f"\n✅ Found {len(urls)} relevant URLs:") + for i, url_info in enumerate(urls[:5], 1): + print(f"\n{i}. 
{url_info['url']}") + if url_info.get('relevance_score'): + print(f" Relevance: {url_info['relevance_score']:.3f}") + if url_info.get('head_data', {}).get('title'): + print(f" Title: {url_info['head_data']['title'][:60]}...") -seeder = AsyncUrlSeeder(seeder_config) -urls = await seeder.discover("https://shop.example.com") - -# Advanced: Relevance-based discovery -research_config = SeedingConfig( - source="crawl+sitemap", # Deep crawl + sitemap - pattern="*/blog/*", # Blog posts only - - # Content relevance - extract_head=True, # Get meta tags - query="quantum computing tutorials", - scoring_method="bm25", # Or "semantic" (coming soon) - score_threshold=0.4, # High relevance only - - # Smart filtering - filter_nonsense_urls=True, # Remove .xml, .txt, etc. - min_content_length=500, # Skip thin content - - force=True # Bypass cache -) - -# Discover with progress tracking -discovered = [] -async for batch in seeder.discover_iter("https://physics-blog.com", research_config): - discovered.extend(batch) - print(f"Found {len(discovered)} relevant URLs so far...") - -# Results include scores and metadata -for url_data in discovered[:5]: - print(f"URL: {url_data['url']}") - print(f"Score: {url_data['score']:.3f}") - print(f"Title: {url_data['title']}") +asyncio.run(main()) ``` **Discovery Methods:** @@ -309,35 +271,18 @@ This release includes significant performance improvements through optimized res ### What We Optimized ```python -# Before v0.7.0 (slow) +# Optimized crawling with v0.7.0 improvements results = [] for url in urls: - result = await crawler.arun(url) - results.append(result) - -# After v0.7.0 (fast) -# Automatic batching and connection pooling -results = await crawler.arun_batch( - urls, - config=CrawlerRunConfig( - # New performance options - batch_size=10, # Process 10 URLs concurrently - reuse_browser=True, # Keep browser warm - eager_loading=False, # Load only what's needed - streaming_extraction=True, # Stream large extractions - - # Optimized defaults - 
wait_until="domcontentloaded", # Faster than networkidle - exclude_external_resources=True, # Skip third-party assets - block_ads=True # Ad blocking built-in + result = await crawler.arun( + url, + config=CrawlerRunConfig( + # Performance optimizations + wait_until="domcontentloaded", # Faster than networkidle + cache_mode=CacheMode.ENABLED # Enable caching + ) ) -) - -# Memory-efficient streaming for large crawls -async for result in crawler.arun_stream(large_url_list): - # Process results as they complete - await process_result(result) - # Memory is freed after each iteration + results.append(result) ``` **Performance Gains:** @@ -347,24 +292,6 @@ async for result in crawler.arun_stream(large_url_list): - **Memory Usage**: 60% reduction with streaming processing - **Concurrent Crawls**: Handle 5x more parallel requests -## 📄 PDF Support - -PDF extraction is now natively supported in Crawl4AI. - -```python -# Extract data from PDF documents -result = await crawler.arun( - "https://example.com/report.pdf", - config=CrawlerRunConfig( - pdf_extraction=True, - extraction_strategy=JsonCssExtractionStrategy({ - # Works on converted PDF structure - "title": {"selector": "h1", "type": "text"}, - "sections": {"selector": "h2", "type": "list"} - }) - ) -) -``` ## 🔧 Important Changes diff --git a/docs/md_v2/core/adaptive-crawling.md b/docs/md_v2/core/adaptive-crawling.md index 72e11937..ea1674c2 100644 --- a/docs/md_v2/core/adaptive-crawling.md +++ b/docs/md_v2/core/adaptive-crawling.md @@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler async def main(): async with AsyncWebCrawler() as crawler: - # Create an adaptive crawler + # Create an adaptive crawler (config is optional) adaptive = AdaptiveCrawler(crawler) # Start crawling with a query @@ -59,13 +59,13 @@ async def main(): from crawl4ai import AdaptiveConfig config = AdaptiveConfig( - confidence_threshold=0.7, # Stop when 70% confident (default: 0.8) - max_pages=20, # Maximum pages to crawl 
(default: 50) - top_k_links=3, # Links to follow per page (default: 5) + confidence_threshold=0.8, # Stop when 80% confident (default: 0.7) + max_pages=30, # Maximum pages to crawl (default: 20) + top_k_links=5, # Links to follow per page (default: 3) min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1) ) -adaptive = AdaptiveCrawler(crawler, config=config) +adaptive = AdaptiveCrawler(crawler, config) ``` ## Crawling Strategies @@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False): The confidence score (0-1) indicates how sufficient the gathered information is: - **0.0-0.3**: Insufficient information, needs more crawling - **0.3-0.6**: Partial information, may answer basic queries -- **0.6-0.8**: Good coverage, can answer most queries -- **0.8-1.0**: Excellent coverage, comprehensive information +- **0.6-0.7**: Good coverage, can answer most queries +- **0.7-1.0**: Excellent coverage, comprehensive information ### Statistics Display @@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl") - Avoid overly broad queries ### 2. Threshold Tuning -- Start with default (0.8) for general use -- Lower to 0.6-0.7 for exploratory crawling -- Raise to 0.9+ for exhaustive coverage +- Start with default (0.7) for general use +- Lower to 0.5-0.6 for exploratory crawling +- Raise to 0.8+ for exhaustive coverage ### 3. 
Performance Optimization - Use appropriate `max_pages` limits diff --git a/docs/md_v2/core/link-media.md b/docs/md_v2/core/link-media.md index f6305ccc..ebce30bd 100644 --- a/docs/md_v2/core/link-media.md +++ b/docs/md_v2/core/link-media.md @@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately: ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig async def extract_link_heads_example(): """ @@ -237,7 +237,7 @@ if __name__ == "__main__": The `LinkPreviewConfig` class supports these options: ```python -from crawl4ai.async_configs import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig link_preview_config = LinkPreviewConfig( # BASIC SETTINGS diff --git a/docs/md_v2/core/url-seeding.md b/docs/md_v2/core/url-seeding.md index 24cdfa46..f891c204 100644 --- a/docs/md_v2/core/url-seeding.md +++ b/docs/md_v2/core/url-seeding.md @@ -137,7 +137,7 @@ async def smart_blog_crawler(): word_count_threshold=300 # Only substantial articles ) - # Extract URLs and stream results as they come + # Extract URLs and crawl them tutorial_urls = [t["url"] for t in tutorials[:10]] results = await crawler.arun_many(tutorial_urls, config=config) @@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I ```python # Use both sources -config = SeedingConfig(source="cc+sitemap") +config = SeedingConfig(source="sitemap+cc") urls = await seeder.urls("example.com", config) ``` @@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. 
Here's everything you can conf | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" | +| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" | | `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") | | `extract_head` | bool | False | Extract metadata from page `<head>` | | `live_check` | bool | False | Verify URLs are accessible | | `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) | | `concurrency` | int | 10 | Parallel workers for fetching | -| `hits_per_sec` | int | None | Rate limit for requests | +| `hits_per_sec` | int | 5 | Rate limit for requests | | `force` | bool | False | Bypass cache, fetch fresh data | | `verbose` | bool | False | Show detailed progress | | `query` | str | None | Search query for BM25 scoring | @@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config) ```python # Find specific products config = SeedingConfig( - source="cc+sitemap", # Use both sources + source="sitemap+cc", # Use both sources extract_head=True, query="wireless headphones noise canceling", scoring_method="bm25", @@ -782,7 +782,7 @@ class ResearchAssistant: # Step 1: Discover relevant URLs config = SeedingConfig( - source="cc+sitemap", # Maximum coverage + source="sitemap+cc", # Maximum coverage extract_head=True, # Get metadata query=topic, # Research topic scoring_method="bm25", # Smart scoring @@ -832,7 +832,8 @@ class ResearchAssistant: # Extract URLs and crawl all articles article_urls = [article['url'] for article in top_articles] results = [] - async for result in await crawler.arun_many(article_urls, config=config): + crawl_results = await crawler.arun_many(article_urls, config=config) + async for result in crawl_results: if result.success: results.append({ 'url': result.url, @@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5) # When
crawling many URLs async with AsyncWebCrawler() as crawler: # Assuming urls is a list of URL strings - results = await crawler.arun_many(urls, config=config) + crawl_results = await crawler.arun_many(urls, config=config) # Process as they arrive - async for result in results: + async for result in crawl_results: process_immediately(result) # Don't wait for all ``` @@ -1020,7 +1021,7 @@ config = SeedingConfig( # E-commerce product discovery config = SeedingConfig( - source="cc+sitemap", + source="sitemap+cc", pattern="*/product/*", extract_head=True, live_check=True diff --git a/docs/releases_review/crawl4ai_v0_7_0_showcase.py b/docs/releases_review/crawl4ai_v0_7_0_showcase.py index eca03d04..29c056f0 100644 --- a/docs/releases_review/crawl4ai_v0_7_0_showcase.py +++ b/docs/releases_review/crawl4ai_v0_7_0_showcase.py @@ -28,7 +28,7 @@ from rich import box from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode from crawl4ai import AsyncUrlSeeder, SeedingConfig -from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig +from crawl4ai import LinkPreviewConfig, VirtualScrollConfig from crawl4ai import c4a_compile, CompilationResult # Initialize Rich console for beautiful output diff --git a/docs/releases_review/v0_7_0_features_demo.py b/docs/releases_review/v0_7_0_features_demo.py index 5a68ff48..2f803a3b 100644 --- a/docs/releases_review/v0_7_0_features_demo.py +++ b/docs/releases_review/v0_7_0_features_demo.py @@ -13,14 +13,13 @@ from crawl4ai import ( BrowserConfig, CacheMode, # New imports for v0.7.0 - LinkPreviewConfig, VirtualScrollConfig, + LinkPreviewConfig, AdaptiveCrawler, AdaptiveConfig, AsyncUrlSeeder, SeedingConfig, c4a_compile, - CompilationResult ) @@ -170,16 +169,16 @@ async def demo_url_seeder(): # Discover Python tutorial URLs config = SeedingConfig( source="sitemap", # Use sitemap - pattern="*tutorial*", # URL pattern filter + pattern="*python*", # URL pattern filter 
extract_head=True, # Get metadata - query="python async programming", # For relevance scoring + query="python tutorial", # For relevance scoring scoring_method="bm25", score_threshold=0.2, max_urls=10 ) print("Discovering Python async tutorial URLs...") - urls = await seeder.urls("docs.python.org", config) + urls = await seeder.urls("https://www.geeksforgeeks.org/", config) print(f"\n✅ Found {len(urls)} relevant URLs:") for i, url_info in enumerate(urls[:5], 1): @@ -245,39 +244,6 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]` print(f"❌ Compilation error: {result.first_error.message}") -async def demo_pdf_support(): - """ - Demo 6: PDF Parsing Support - - Shows how to extract content from PDF files. - Note: Requires 'pip install crawl4ai[pdf]' - """ - print("\n" + "="*60) - print("📄 DEMO 6: PDF Parsing Support") - print("="*60) - - try: - # Check if PDF support is installed - import PyPDF2 - - # Example: Process a PDF URL - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - pdf=True, # Enable PDF generation - extract_text_from_pdf=True # Extract text content - ) - - print("PDF parsing is available!") - print("You can now crawl PDF URLs and extract their content.") - print("\nExample usage:") - print(' result = await crawler.arun("https://example.com/document.pdf")') - print(' pdf_text = result.extracted_content # Contains extracted text') - - except ImportError: - print("⚠️ PDF support not installed.") - print("Install with: pip install crawl4ai[pdf]") - - async def main(): """Run all demos""" print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations") @@ -289,7 +255,6 @@ async def main(): ("Virtual Scroll", demo_virtual_scroll), ("URL Seeder", demo_url_seeder), ("C4A Script", demo_c4a_script), - ("PDF Support", demo_pdf_support) ] for name, demo_func in demos: @@ -309,7 +274,6 @@ async def main(): print("• Virtual Scroll: Capture all content from modern web pages") print("• URL Seeder: Pre-discover and filter URLs efficiently") print("• 
#!/usr/bin/env python3
"""
Simple API Test for Crawl4AI Docker Server v0.7.0
Uses only built-in Python modules to test all endpoints.
"""

import json
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Dict, List, Optional

# Configuration
BASE_URL = "http://localhost:11234"  # Change to your server URL
TEST_TIMEOUT = 30


class SimpleApiTester:
    """Smoke-test client that calls the server's HTTP endpoints one by one.

    Every call produces a result dict with the keys ``endpoint``, ``method``,
    ``status`` ("PASS"/"FAIL"), ``status_code`` and ``response_time``, plus
    ``data`` (parsed response body) and/or ``error`` (failure description).
    Results passed to :meth:`print_result` are collected in ``self.results``.
    """

    def __init__(self, base_url: str = BASE_URL):
        self.base_url = base_url
        self.token = None  # Bearer token; sent on every request once set
        self.results = []

    def log(self, message: str):
        """Print an informational message."""
        print(f"[INFO] {message}")

    @staticmethod
    def _parse_body(content: str):
        """Return *content* decoded as JSON, or a truncated raw preview."""
        try:
            return json.loads(content)
        except ValueError:
            # Not JSON (json.JSONDecodeError is a ValueError): keep a short
            # preview so the report stays small.
            return {"raw_response": content[:200]}

    @staticmethod
    def _build_result(endpoint: str, method: str, status_code: int,
                      response_time: float, content: str,
                      error: Optional[str] = None) -> Dict:
        """Assemble the result dict for a request that got an HTTP response."""
        result = {
            "endpoint": endpoint,
            "method": method,
            "status": "PASS" if status_code < 400 else "FAIL",
            "status_code": status_code,
            "response_time": response_time,
            "data": SimpleApiTester._parse_body(content),
        }
        if error is not None:
            result["error"] = error
        return result

    @staticmethod
    def _read_error_body(exc: urllib.error.HTTPError) -> str:
        """Best-effort read of the body attached to an HTTP error response."""
        try:
            return exc.read().decode('utf-8', errors='replace')
        except (OSError, ValueError, AttributeError):
            # The body is optional diagnostic detail; never fail the run on it.
            return ""

    def _request(self, method: str, endpoint: str,
                 payload: Optional[Dict] = None) -> Dict:
        """Send one request and convert the outcome into a result dict.

        Args:
            method: HTTP method name ("GET" or "POST").
            endpoint: Path (and query string) appended to ``self.base_url``.
            payload: Optional JSON-serialisable request body.

        Returns:
            A result dict; this method never raises for request failures.
        """
        url = f"{self.base_url}{endpoint}"
        start_time = time.time()

        try:
            body = None if payload is None else json.dumps(payload).encode('utf-8')
            req = urllib.request.Request(url, data=body, method=method)
            if body is not None:
                req.add_header('Content-Type', 'application/json')
            if self.token:
                req.add_header('Authorization', f'Bearer {self.token}')

            with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
                response_time = time.time() - start_time
                status_code = response.getcode()
                content = response.read().decode('utf-8')

            return self._build_result(endpoint, method, status_code,
                                      response_time, content)
        except urllib.error.HTTPError as exc:
            # urlopen raises for 4xx/5xx responses; keep the real status code
            # and body instead of reporting the failure as "N/A".
            response_time = time.time() - start_time
            return self._build_result(endpoint, method, exc.code, response_time,
                                      self._read_error_body(exc), error=str(exc))
        except Exception as e:
            # Connection refused, timeout, undecodable body, ...: no usable
            # HTTP response, so there is no status code to report.
            response_time = time.time() - start_time
            return {
                "endpoint": endpoint,
                "method": method,
                "status": "FAIL",
                "status_code": None,
                "response_time": response_time,
                "error": str(e)
            }

    def test_get_endpoint(self, endpoint: str) -> Dict:
        """Test a GET endpoint"""
        return self._request("GET", endpoint)

    def test_post_endpoint(self, endpoint: str, payload: Dict) -> Dict:
        """Test a POST endpoint"""
        return self._request("POST", endpoint, payload)

    def print_result(self, result: Dict):
        """Print a formatted test result and record it in ``self.results``."""
        status_color = {
            "PASS": "✅",
            "FAIL": "❌",
            "SKIP": "⏭️"
        }

        print(f"{status_color[result['status']]} {result['method']} {result['endpoint']} "
              f"| {result['response_time']:.3f}s | Status: {result['status_code'] or 'N/A'}")

        if result['status'] == 'FAIL' and 'error' in result:
            print(f"   Error: {result['error']}")

        self.results.append(result)

    def run_all_tests(self):
        """Run all API tests"""
        print("🚀 Starting Crawl4AI v0.7.0 API Test Suite")
        print(f"📡 Testing server at: {self.base_url}")
        print("=" * 60)

        # # Test basic endpoints
        # print("\n=== BASIC ENDPOINTS ===")

        # # Health check
        # result = self.test_get_endpoint("/health")
        # self.print_result(result)

        # # Schema endpoint
        # result = self.test_get_endpoint("/schema")
        # self.print_result(result)

        # # Metrics endpoint
        # result = self.test_get_endpoint("/metrics")
        # self.print_result(result)

        # # Root redirect
        # result = self.test_get_endpoint("/")
        # self.print_result(result)

        # # Test authentication
        # print("\n=== AUTHENTICATION ===")

        # # Get token
        # token_payload = {"email": "test@example.com"}
        # result = self.test_post_endpoint("/token", token_payload)
        # self.print_result(result)

        # # Extract token if successful
        # if result['status'] == 'PASS' and 'data' in result:
        #     token = result['data'].get('access_token')
        #     if token:
        #         self.token = token
        #         self.log(f"Successfully obtained auth token: {token[:20]}...")

        # Test core APIs
        print("\n=== CORE APIs ===")

        test_url = "https://example.com"

        # Test markdown endpoint
        md_payload = {
            "url": test_url,
            "f": "fit",
            "q": "test query",
            "c": "0"
        }
        result = self.test_post_endpoint("/md", md_payload)
        # print(result['data'].get('markdown', ''))
        self.print_result(result)

        # Test HTML endpoint
        html_payload = {"url": test_url}
        result = self.test_post_endpoint("/html", html_payload)
        self.print_result(result)

        # Test screenshot endpoint
        screenshot_payload = {
            "url": test_url,
            "screenshot_wait_for": 2
        }
        result = self.test_post_endpoint("/screenshot", screenshot_payload)
        self.print_result(result)

        # Test PDF endpoint
        pdf_payload = {"url": test_url}
        result = self.test_post_endpoint("/pdf", pdf_payload)
        self.print_result(result)

        # Test JavaScript execution
        js_payload = {
            "url": test_url,
            "scripts": ["(() => document.title)()"]
        }
        result = self.test_post_endpoint("/execute_js", js_payload)
        self.print_result(result)

        # Test crawl endpoint
        crawl_payload = {
            "urls": [test_url],
            "browser_config": {},
            "crawler_config": {}
        }
        result = self.test_post_endpoint("/crawl", crawl_payload)
        self.print_result(result)

        # Test config dump
        config_payload = {"code": "CrawlerRunConfig()"}
        result = self.test_post_endpoint("/config/dump", config_payload)
        self.print_result(result)

        # Test LLM endpoint
        llm_endpoint = f"/llm/{test_url}?q=Extract%20main%20content"
        result = self.test_get_endpoint(llm_endpoint)
        self.print_result(result)

        # Test ask endpoint
        ask_endpoint = "/ask?context_type=all&query=crawl4ai&max_results=5"
        result = self.test_get_endpoint(ask_endpoint)
        print(result)
        self.print_result(result)

        # Test job APIs
        print("\n=== JOB APIs ===")

        # Test LLM job
        llm_job_payload = {
            "url": test_url,
            "q": "Extract main content",
            "cache": False
        }
        result = self.test_post_endpoint("/llm/job", llm_job_payload)
        self.print_result(result)

        # Test crawl job
        crawl_job_payload = {
            "urls": [test_url],
            "browser_config": {},
            "crawler_config": {}
        }
        result = self.test_post_endpoint("/crawl/job", crawl_job_payload)
        self.print_result(result)

        # Test MCP
        print("\n=== MCP APIs ===")

        # Test MCP schema
        result = self.test_get_endpoint("/mcp/schema")
        self.print_result(result)

        # Test error handling
        print("\n=== ERROR HANDLING ===")

        # Test invalid URL
        invalid_payload = {"url": "invalid-url", "f": "fit"}
        result = self.test_post_endpoint("/md", invalid_payload)
        self.print_result(result)

        # Test invalid endpoint
        result = self.test_get_endpoint("/nonexistent")
        self.print_result(result)

        # Print summary
        self.print_summary()

    def print_summary(self):
        """Print test results summary and write a JSON report to the CWD."""
        print("\n" + "=" * 60)
        print("📊 TEST RESULTS SUMMARY")
        print("=" * 60)

        total = len(self.results)
        passed = sum(1 for r in self.results if r['status'] == 'PASS')
        failed = sum(1 for r in self.results if r['status'] == 'FAIL')
        # Guard against an empty run so the summary never divides by zero.
        success_rate = (passed / total) * 100 if total else 0.0

        print(f"Total Tests: {total}")
        print(f"✅ Passed: {passed}")
        print(f"❌ Failed: {failed}")
        print(f"📈 Success Rate: {success_rate:.1f}%")

        if failed > 0:
            print("\n❌ FAILED TESTS:")
            for result in self.results:
                if result['status'] == 'FAIL':
                    print(f"  • {result['method']} {result['endpoint']}")
                    if 'error' in result:
                        print(f"    Error: {result['error']}")

        # Performance statistics
        response_times = [r['response_time'] for r in self.results if r['response_time'] > 0]
        if response_times:
            avg_time = sum(response_times) / len(response_times)
            max_time = max(response_times)
            print(f"\n⏱️  Average Response Time: {avg_time:.3f}s")
            print(f"⏱️  Max Response Time: {max_time:.3f}s")

        # Save detailed report
        report_file = f"crawl4ai_test_report_{int(time.time())}.json"
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump({
                "timestamp": time.time(),
                "server_url": self.base_url,
                "version": "0.7.0",
                "summary": {
                    "total": total,
                    "passed": passed,
                    "failed": failed
                },
                "results": self.results
            }, f, indent=2)

        print(f"\n📄 Detailed report saved to: {report_file}")


def main():
    """Main test runner"""
    import argparse

    parser = argparse.ArgumentParser(description='Crawl4AI v0.7.0 API Test Suite')
    parser.add_argument('--url', default=BASE_URL, help='Base URL of the server')

    args = parser.parse_args()

    tester = SimpleApiTester(args.url)

    try:
        tester.run_all_tests()
    except KeyboardInterrupt:
        print("\n🛑 Test suite interrupted by user")
    except Exception as e:
        print(f"\n💥 Test suite failed with error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
CrawlerRunConfig(") print(" link_preview_config=LinkPreviewConfig(") for key, value in config_dict.items():