From dd5ee752cf617700ed029ddccc6186875ac121a2 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 12 Jul 2025 19:58:26 +0800 Subject: [PATCH 1/9] docs: Add missing documentation pages to mkdocs.yml - Added Adaptive Crawling to Core section - Added URL Seeding to Core section - Added Adaptive Strategies to Advanced section --- mkdocs.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mkdocs.yml b/mkdocs.yml index 43b3c74a..1cc65101 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,6 +25,8 @@ nav: - "Command Line Interface": "core/cli.md" - "Simple Crawling": "core/simple-crawling.md" - "Deep Crawling": "core/deep-crawling.md" + - "Adaptive Crawling": "core/adaptive-crawling.md" + - "URL Seeding": "core/url-seeding.md" - "C4A-Script": "core/c4a-script.md" - "Crawler Result": "core/crawler-result.md" - "Browser, Crawler & LLM Config": "core/browser-crawler-config.md" @@ -37,6 +39,7 @@ nav: - "Link & Media": "core/link-media.md" - Advanced: - "Overview": "advanced/advanced-features.md" + - "Adaptive Strategies": "advanced/adaptive-strategies.md" - "Virtual Scroll": "advanced/virtual-scroll.md" - "File Downloading": "advanced/file-downloading.md" - "Lazy Loading": "advanced/lazy-loading.md" From 58024755c589c899e3eac5df702e4482b705ce27 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Jul 2025 10:15:05 +0200 Subject: [PATCH 2/9] docs: Update adaptive crawling parameters and examples in README and release notes --- README.md | 17 +++++----- docs/blog/release-v0.7.0.md | 52 +++++++++++++------------------ docs/md_v2/blog/releases/0.7.0.md | 52 +++++++++++++------------------ 3 files changed, 52 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index b74f386e..97a907a1 100644 --- a/README.md +++ b/README.md @@ -523,15 +523,18 @@ async def test_news_crawl(): - **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically: ```python config = AdaptiveConfig( - confidence_threshold=0.7, - max_history=100, - 
learning_rate=0.2 + confidence_threshold=0.7, # Min confidence to stop crawling + max_depth=5, # Maximum crawl depth + max_pages=20, # Maximum number of pages to crawl + strategy="statistical" ) - result = await crawler.arun( - "https://news.example.com", - config=CrawlerRunConfig(adaptive_config=config) - ) + async with AsyncWebCrawler() as crawler: + adaptive_crawler = AdaptiveCrawler(crawler, config) + state = await adaptive_crawler.digest( + start_url="https://news.example.com", + query="latest news content" + ) # Crawler learns patterns and improves extraction over time ``` diff --git a/docs/blog/release-v0.7.0.md b/docs/blog/release-v0.7.0.md index 4ae9a689..56fb4914 100644 --- a/docs/blog/release-v0.7.0.md +++ b/docs/blog/release-v0.7.0.md @@ -30,44 +30,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking: - Extraction confidence scores ```python -from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState +from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig -# Initialize with custom learning parameters +# Initialize with custom adaptive parameters config = AdaptiveConfig( - confidence_threshold=0.7, # Min confidence to use learned patterns - max_history=100, # Remember last 100 crawls per domain - learning_rate=0.2, # How quickly to adapt to changes - patterns_per_page=3, # Patterns to learn per page type - extraction_strategy='css' # 'css' or 'xpath' + confidence_threshold=0.7, # Min confidence to stop crawling + max_depth=5, # Maximum crawl depth + max_pages=20, # Maximum number of pages to crawl + top_k_links=3, # Number of top links to follow per page + strategy="statistical", # 'statistical' or 'embedding' + coverage_weight=0.4, # Weight for coverage in confidence calculation + consistency_weight=0.3, # Weight for consistency in confidence calculation + saturation_weight=0.3 # Weight for saturation in confidence calculation ) -adaptive_crawler = AdaptiveCrawler(config) - -# First crawl - crawler 
learns the structure +# Initialize adaptive crawler with web crawler async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - "https://news.example.com/article/12345", - config=CrawlerRunConfig( - adaptive_config=config, - extraction_hints={ # Optional hints to speed up learning - "title": "article h1", - "content": "article .body-content" - } - ) + adaptive_crawler = AdaptiveCrawler(crawler, config) + + # Crawl and learn patterns + state = await adaptive_crawler.digest( + start_url="https://news.example.com/article/12345", + query="latest news articles and content" ) - # Crawler identifies and stores patterns - if result.success: - state = adaptive_crawler.get_state("news.example.com") - print(f"Learned {len(state.patterns)} patterns") - print(f"Confidence: {state.avg_confidence:.2%}") - -# Subsequent crawls - uses learned patterns -result2 = await crawler.arun( - "https://news.example.com/article/67890", - config=CrawlerRunConfig(adaptive_config=config) -) -# Automatically extracts using learned patterns! 
+ # Access results and confidence + print(f"Confidence Level: {adaptive_crawler.confidence:.0%}") + print(f"Pages Crawled: {len(state.crawled_urls)}") + print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents") ``` **Expected Real-World Impact:** diff --git a/docs/md_v2/blog/releases/0.7.0.md b/docs/md_v2/blog/releases/0.7.0.md index 4ae9a689..56fb4914 100644 --- a/docs/md_v2/blog/releases/0.7.0.md +++ b/docs/md_v2/blog/releases/0.7.0.md @@ -30,44 +30,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking: - Extraction confidence scores ```python -from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState +from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig -# Initialize with custom learning parameters +# Initialize with custom adaptive parameters config = AdaptiveConfig( - confidence_threshold=0.7, # Min confidence to use learned patterns - max_history=100, # Remember last 100 crawls per domain - learning_rate=0.2, # How quickly to adapt to changes - patterns_per_page=3, # Patterns to learn per page type - extraction_strategy='css' # 'css' or 'xpath' + confidence_threshold=0.7, # Min confidence to stop crawling + max_depth=5, # Maximum crawl depth + max_pages=20, # Maximum number of pages to crawl + top_k_links=3, # Number of top links to follow per page + strategy="statistical", # 'statistical' or 'embedding' + coverage_weight=0.4, # Weight for coverage in confidence calculation + consistency_weight=0.3, # Weight for consistency in confidence calculation + saturation_weight=0.3 # Weight for saturation in confidence calculation ) -adaptive_crawler = AdaptiveCrawler(config) - -# First crawl - crawler learns the structure +# Initialize adaptive crawler with web crawler async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - "https://news.example.com/article/12345", - config=CrawlerRunConfig( - adaptive_config=config, - extraction_hints={ # Optional hints to speed up learning 
- "title": "article h1", - "content": "article .body-content" - } - ) + adaptive_crawler = AdaptiveCrawler(crawler, config) + + # Crawl and learn patterns + state = await adaptive_crawler.digest( + start_url="https://news.example.com/article/12345", + query="latest news articles and content" ) - # Crawler identifies and stores patterns - if result.success: - state = adaptive_crawler.get_state("news.example.com") - print(f"Learned {len(state.patterns)} patterns") - print(f"Confidence: {state.avg_confidence:.2%}") - -# Subsequent crawls - uses learned patterns -result2 = await crawler.arun( - "https://news.example.com/article/67890", - config=CrawlerRunConfig(adaptive_config=config) -) -# Automatically extracts using learned patterns! + # Access results and confidence + print(f"Confidence Level: {adaptive_crawler.confidence:.0%}") + print(f"Pages Crawled: {len(state.crawled_urls)}") + print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents") ``` **Expected Real-World Impact:** From 2640dc73a59fe82f45a73662de21bd7d569ed14a Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Jul 2025 10:19:29 +0200 Subject: [PATCH 3/9] docs: Enhance session management example for dynamic content crawling with improved JavaScript handling and extraction schema. 
ref #226 --- docs/md_v2/advanced/session-management.md | 79 ++++++++++++++++------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index d63b1e80..1007a605 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy from crawl4ai.cache_context import CacheMode async def crawl_dynamic_content(): - async with AsyncWebCrawler() as crawler: - session_id = "github_commits_session" - url = "https://github.com/microsoft/TypeScript/commits/main" - all_commits = [] + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "wait_for_session" + all_commits = [] - # Define extraction schema - schema = { - "name": "Commit Extractor", - "baseSelector": "li.Box-sc-g0xbh4-0", - "fields": [{ - "name": "title", "selector": "h4.markdown-title", "type": "text" - }], - } - extraction_strategy = JsonCssExtractionStrategy(schema) + js_next_page = """ + const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4'); + if (commits.length > 0) { + window.lastCommit = commits[0].textContent.trim(); + } + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) {button.click(); console.log('button clicked') } + """ - # JavaScript and wait configurations - js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();""" - wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0""" - - # Crawl multiple pages + wait_for = """() => { + const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4'); + if (commits.length === 0) return false; + const firstCommit = commits[0].textContent.trim(); + return firstCommit !== window.lastCommit; + }""" + + schema = { + "name": "Commit Extractor", + "baseSelector": "li[data-testid='commit-row-item']", + 
"fields": [ + { + "name": "title", + "selector": "h4 a", + "type": "text", + "transform": "strip", + }, + ], + } + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + + browser_config = BrowserConfig( + verbose=True, + headless=False, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: for page in range(3): - config = CrawlerRunConfig( - url=url, + crawler_config = CrawlerRunConfig( session_id=session_id, + css_selector="li[data-testid='commit-row-item']", extraction_strategy=extraction_strategy, js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - cache_mode=CacheMode.BYPASS + cache_mode=CacheMode.BYPASS, + capture_console_messages=True, ) - - result = await crawler.arun(config=config) - if result.success: + + result = await crawler.arun(url=url, config=crawler_config) + + if result.console_messages: + print(f"Page {page + 1} console messages:", result.console_messages) + + if result.extracted_content: + # print(f"Page {page + 1} result:", result.extracted_content) commits = json.loads(result.extracted_content) all_commits.extend(commits) print(f"Page {page + 1}: Found {len(commits)} commits") + else: + print(f"Page {page + 1}: No content extracted") + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") # Clean up session await crawler.crawler_strategy.kill_session(session_id) - return all_commits ``` --- From 205df1e33043d87d8d3f731823a76a9173be051b Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Jul 2025 10:29:47 +0200 Subject: [PATCH 4/9] docs: Fix virtual scroll configuration --- docs/blog/release-v0.7.0.md | 11 +++-------- docs/md_v2/blog/releases/0.7.0.md | 11 +++-------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/docs/blog/release-v0.7.0.md b/docs/blog/release-v0.7.0.md index 56fb4914..6bbf670a 100644 --- a/docs/blog/release-v0.7.0.md +++ b/docs/blog/release-v0.7.0.md @@ -82,9 +82,7 @@ twitter_config = 
VirtualScrollConfig( container_selector="[data-testid='primaryColumn']", scroll_count=20, # Number of scrolls scroll_by="container_height", # Smart scrolling by container size - wait_after_scroll=1.0, # Let content load - capture_method="incremental", # Capture new content on each scroll - deduplicate=True # Remove duplicate elements + wait_after_scroll=1.0 # Let content load ) # For e-commerce product grids (Instagram style) @@ -92,8 +90,7 @@ grid_config = VirtualScrollConfig( container_selector="main .product-grid", scroll_count=30, scroll_by=800, # Fixed pixel scrolling - wait_after_scroll=1.5, # Images need time - stop_on_no_change=True # Smart stopping + wait_after_scroll=1.5 # Images need time ) # For news feeds with lazy loading @@ -101,9 +98,7 @@ news_config = VirtualScrollConfig( container_selector=".article-feed", scroll_count=50, scroll_by="page_height", # Viewport-based scrolling - wait_after_scroll=0.5, - wait_for_selector=".article-card", # Wait for specific elements - timeout=30000 # Max 30 seconds total + wait_after_scroll=0.5 # Wait for content to load ) # Use it in your crawl diff --git a/docs/md_v2/blog/releases/0.7.0.md b/docs/md_v2/blog/releases/0.7.0.md index 56fb4914..6bbf670a 100644 --- a/docs/md_v2/blog/releases/0.7.0.md +++ b/docs/md_v2/blog/releases/0.7.0.md @@ -82,9 +82,7 @@ twitter_config = VirtualScrollConfig( container_selector="[data-testid='primaryColumn']", scroll_count=20, # Number of scrolls scroll_by="container_height", # Smart scrolling by container size - wait_after_scroll=1.0, # Let content load - capture_method="incremental", # Capture new content on each scroll - deduplicate=True # Remove duplicate elements + wait_after_scroll=1.0 # Let content load ) # For e-commerce product grids (Instagram style) @@ -92,8 +90,7 @@ grid_config = VirtualScrollConfig( container_selector="main .product-grid", scroll_count=30, scroll_by=800, # Fixed pixel scrolling - wait_after_scroll=1.5, # Images need time - stop_on_no_change=True # Smart 
stopping + wait_after_scroll=1.5 # Images need time ) # For news feeds with lazy loading @@ -101,9 +98,7 @@ news_config = VirtualScrollConfig( container_selector=".article-feed", scroll_count=50, scroll_by="page_height", # Viewport-based scrolling - wait_after_scroll=0.5, - wait_for_selector=".article-card", # Wait for specific elements - timeout=30000 # Max 30 seconds total + wait_after_scroll=0.5 # Wait for content to load ) # Use it in your crawl From 1d1970ae69f6f886578e7e2ca64bf1cb31a11326 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Jul 2025 11:32:04 +0200 Subject: [PATCH 5/9] docs: Update release notes and docs for v0.7.0 with the correct parameters and explanations --- docs/blog/release-v0.7.0.md | 152 ++++++++++---------------- docs/md_v2/advanced/virtual-scroll.md | 11 +- docs/md_v2/blog/releases/0.7.0.md | 152 ++++++++++---------------- docs/md_v2/core/adaptive-crawling.md | 20 ++-- docs/md_v2/core/url-seeding.md | 21 ++-- 5 files changed, 146 insertions(+), 210 deletions(-) diff --git a/docs/blog/release-v0.7.0.md b/docs/blog/release-v0.7.0.md index 6bbf670a..0772ae58 100644 --- a/docs/blog/release-v0.7.0.md +++ b/docs/blog/release-v0.7.0.md @@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. 
This rel - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages -- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization +- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering -- **PDF Parsing**: Extract data from PDF documents - **Performance Optimizations**: Significant speed and memory improvements ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning @@ -145,29 +144,17 @@ async with AsyncWebCrawler() as crawler: ### The Three-Layer Scoring System ```python -from crawl4ai import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode # Configure intelligent link analysis link_config = LinkPreviewConfig( - # What to analyze include_internal=True, - include_external=True, - max_links=100, # Analyze top 100 links - - # Relevance scoring - query="machine learning tutorials", # Your interest - score_threshold=0.3, # Minimum relevance score - - # Performance - concurrent_requests=10, # Parallel processing - timeout_per_link=5000, # 5s per link - - # Advanced scoring weights - scoring_weights={ - "intrinsic": 0.3, # Link quality indicators - "contextual": 0.5, # Relevance to query - "popularity": 0.2 # Link prominence - } + include_external=False, + max_links=10, + concurrency=5, + query="python tutorial", # For contextual scoring + score_threshold=0.3, + verbose=True ) # Use in your crawl @@ -175,35 +162,51 @@ result = await crawler.arun( "https://tech-blog.example.com", config=CrawlerRunConfig( link_preview_config=link_config, - score_links=True + score_links=True, # Enable intrinsic scoring + cache_mode=CacheMode.BYPASS ) ) # Access scored and sorted links -for link in result.links["internal"][:10]: # Top 10 internal links - print(f"Score: {link['total_score']:.3f}") 
- print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes - print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query - print(f" URL: {link['href']}") - print(f" Title: {link['head_data']['title']}") - print(f" Description: {link['head_data']['meta']['description'][:100]}...") +if result.success and result.links: +# Get scored links +internal_links = result.links.get("internal", []) +scored_links = [l for l in internal_links if l.get("total_score")] +scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True) + +# Create a scoring table +table = Table(title="Link Scoring Results", box=box.ROUNDED) +table.add_column("Link Text", style="cyan", width=40) +table.add_column("Intrinsic Score", justify="center") +table.add_column("Contextual Score", justify="center") +table.add_column("Total Score", justify="center", style="bold green") + +for link in scored_links[:5]: + text = link.get('text', 'No text')[:40] + table.add_row( + text, + f"{link.get('intrinsic_score', 0):.1f}/10", + f"{link.get('contextual_score', 0):.2f}/1", + f"{link.get('total_score', 0):.3f}" + ) + +console.print(table) ``` **Scoring Components:** -1. **Intrinsic Score (0-10)**: Based on link quality indicators +1. **Intrinsic Score**: Based on link quality indicators - Position on page (navigation, content, footer) - Link attributes (rel, title, class names) - Anchor text quality and length - URL structure and depth -2. **Contextual Score (0-1)**: Relevance to your query - - Semantic similarity using embeddings +2. **Contextual Score**: Relevance to your query using BM25 algorithm - Keyword matching in link text and title - Meta description analysis - Content preview scoring -3. **Total Score**: Weighted combination for final ranking +3. 
**Total Score**: Combined score for final ranking **Expected Real-World Impact:** - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links @@ -225,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig # Basic discovery - find all product pages seeder_config = SeedingConfig( # Discovery sources - source="sitemap+cc", # Sitemap + Common Crawl + source="cc+sitemap", # Sitemap + Common Crawl # Filtering pattern="*/product/*", # URL pattern matching - ignore_patterns=["*/reviews/*", "*/questions/*"], # Validation live_check=True, # Verify URLs are alive - max_urls=5000, # Stop at 5000 URLs + max_urls=50, # Stop at 50 URLs # Performance - concurrency=100, # Parallel requests - hits_per_sec=10 # Rate limiting + concurrency=100, # Maximum concurrent requests for live checks/head extraction + hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers ) -seeder = AsyncUrlSeeder(seeder_config) -urls = await seeder.discover("https://shop.example.com") +async with AsyncUrlSeeder() as seeder: + console.print("Discovering URLs from Python docs...") + urls = await seeder.urls("docs.python.org", seeder_config) + console.print(f"\n✓ Discovered {len(urls)} URLs") # Advanced: Relevance-based discovery research_config = SeedingConfig( - source="crawl+sitemap", # Deep crawl + sitemap + source="sitemap+cc", # Sitemap + Common Crawl pattern="*/blog/*", # Blog posts only # Content relevance extract_head=True, # Get meta tags query="quantum computing tutorials", - scoring_method="bm25", # Or "semantic" (coming soon) + scoring_method="bm25", # BM25 scoring method score_threshold=0.4, # High relevance only # Smart filtering filter_nonsense_urls=True, # Remove .xml, .txt, etc. 
- min_content_length=500, # Skip thin content force=True # Bypass cache ) # Discover with progress tracking discovered = [] -async for batch in seeder.discover_iter("https://physics-blog.com", research_config): - discovered.extend(batch) - print(f"Found {len(discovered)} relevant URLs so far...") +async with AsyncUrlSeeder() as seeder: + discovered = await seeder.urls("https://physics-blog.com", research_config) + console.print(f"\n✓ Discovered {len(discovered)} URLs") # Results include scores and metadata for url_data in discovered[:5]: print(f"URL: {url_data['url']}") - print(f"Score: {url_data['score']:.3f}") - print(f"Title: {url_data['title']}") + print(f"Score: {url_data['relevance_score']:.3f}") + print(f"Title: {url_data['head_data']['title']}") ``` **Discovery Methods:** @@ -294,35 +297,18 @@ This release includes significant performance improvements through optimized res ### What We Optimized ```python -# Before v0.7.0 (slow) +# Optimized crawling with v0.7.0 improvements results = [] for url in urls: - result = await crawler.arun(url) - results.append(result) - -# After v0.7.0 (fast) -# Automatic batching and connection pooling -results = await crawler.arun_batch( - urls, - config=CrawlerRunConfig( - # New performance options - batch_size=10, # Process 10 URLs concurrently - reuse_browser=True, # Keep browser warm - eager_loading=False, # Load only what's needed - streaming_extraction=True, # Stream large extractions - - # Optimized defaults - wait_until="domcontentloaded", # Faster than networkidle - exclude_external_resources=True, # Skip third-party assets - block_ads=True # Ad blocking built-in + result = await crawler.arun( + url, + config=CrawlerRunConfig( + # Performance optimizations + wait_until="domcontentloaded", # Faster than networkidle + cache_mode=CacheMode.ENABLED # Enable caching + ) ) -) - -# Memory-efficient streaming for large crawls -async for result in crawler.arun_stream(large_url_list): - # Process results as they complete - await 
process_result(result) - # Memory is freed after each iteration + results.append(result) ``` **Performance Gains:** @@ -332,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list): - **Memory Usage**: 60% reduction with streaming processing - **Concurrent Crawls**: Handle 5x more parallel requests -## 📄 PDF Support - -PDF extraction is now natively supported in Crawl4AI. - -```python -# Extract data from PDF documents -result = await crawler.arun( - "https://example.com/report.pdf", - config=CrawlerRunConfig( - pdf_extraction=True, - extraction_strategy=JsonCssExtractionStrategy({ - # Works on converted PDF structure - "title": {"selector": "h1", "type": "text"}, - "sections": {"selector": "h2", "type": "list"} - }) - ) -) -``` ## 🔧 Important Changes diff --git a/docs/md_v2/advanced/virtual-scroll.md b/docs/md_v2/advanced/virtual-scroll.md index 0b1a8f88..271a9564 100644 --- a/docs/md_v2/advanced/virtual-scroll.md +++ b/docs/md_v2/advanced/virtual-scroll.md @@ -91,13 +91,12 @@ async def crawl_twitter_timeline(): wait_after_scroll=1.0 # Twitter needs time to load ) + browser_config = BrowserConfig(headless=True) # Set to False to watch it work config = CrawlerRunConfig( - virtual_scroll_config=virtual_config, - # Optional: Set headless=False to watch it work - # browser_config=BrowserConfig(headless=False) + virtual_scroll_config=virtual_config ) - async with AsyncWebCrawler() as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url="https://twitter.com/search?q=AI", config=config @@ -200,7 +199,7 @@ Use **scan_full_page** when: Virtual Scroll works seamlessly with extraction strategies: ```python -from crawl4ai import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy, LLMConfig # Define extraction schema schema = { @@ -222,7 +221,7 @@ config = CrawlerRunConfig( scroll_count=20 ), extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o-mini", + 
llm_config=LLMConfig(provider="openai/gpt-4o-mini"), schema=schema ) ) diff --git a/docs/md_v2/blog/releases/0.7.0.md b/docs/md_v2/blog/releases/0.7.0.md index 6bbf670a..0772ae58 100644 --- a/docs/md_v2/blog/releases/0.7.0.md +++ b/docs/md_v2/blog/releases/0.7.0.md @@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages -- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization +- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering -- **PDF Parsing**: Extract data from PDF documents - **Performance Optimizations**: Significant speed and memory improvements ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning @@ -145,29 +144,17 @@ async with AsyncWebCrawler() as crawler: ### The Three-Layer Scoring System ```python -from crawl4ai import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode # Configure intelligent link analysis link_config = LinkPreviewConfig( - # What to analyze include_internal=True, - include_external=True, - max_links=100, # Analyze top 100 links - - # Relevance scoring - query="machine learning tutorials", # Your interest - score_threshold=0.3, # Minimum relevance score - - # Performance - concurrent_requests=10, # Parallel processing - timeout_per_link=5000, # 5s per link - - # Advanced scoring weights - scoring_weights={ - "intrinsic": 0.3, # Link quality indicators - "contextual": 0.5, # Relevance to query - "popularity": 0.2 # Link prominence - } + include_external=False, + max_links=10, + concurrency=5, + query="python tutorial", # For contextual scoring + score_threshold=0.3, + verbose=True ) # Use in your crawl @@ -175,35 +162,51 @@ result = 
await crawler.arun( "https://tech-blog.example.com", config=CrawlerRunConfig( link_preview_config=link_config, - score_links=True + score_links=True, # Enable intrinsic scoring + cache_mode=CacheMode.BYPASS ) ) # Access scored and sorted links -for link in result.links["internal"][:10]: # Top 10 internal links - print(f"Score: {link['total_score']:.3f}") - print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes - print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query - print(f" URL: {link['href']}") - print(f" Title: {link['head_data']['title']}") - print(f" Description: {link['head_data']['meta']['description'][:100]}...") +if result.success and result.links: +# Get scored links +internal_links = result.links.get("internal", []) +scored_links = [l for l in internal_links if l.get("total_score")] +scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True) + +# Create a scoring table +table = Table(title="Link Scoring Results", box=box.ROUNDED) +table.add_column("Link Text", style="cyan", width=40) +table.add_column("Intrinsic Score", justify="center") +table.add_column("Contextual Score", justify="center") +table.add_column("Total Score", justify="center", style="bold green") + +for link in scored_links[:5]: + text = link.get('text', 'No text')[:40] + table.add_row( + text, + f"{link.get('intrinsic_score', 0):.1f}/10", + f"{link.get('contextual_score', 0):.2f}/1", + f"{link.get('total_score', 0):.3f}" + ) + +console.print(table) ``` **Scoring Components:** -1. **Intrinsic Score (0-10)**: Based on link quality indicators +1. **Intrinsic Score**: Based on link quality indicators - Position on page (navigation, content, footer) - Link attributes (rel, title, class names) - Anchor text quality and length - URL structure and depth -2. **Contextual Score (0-1)**: Relevance to your query - - Semantic similarity using embeddings +2. 
**Contextual Score**: Relevance to your query using BM25 algorithm - Keyword matching in link text and title - Meta description analysis - Content preview scoring -3. **Total Score**: Weighted combination for final ranking +3. **Total Score**: Combined score for final ranking **Expected Real-World Impact:** - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links @@ -225,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig # Basic discovery - find all product pages seeder_config = SeedingConfig( # Discovery sources - source="sitemap+cc", # Sitemap + Common Crawl + source="cc+sitemap", # Sitemap + Common Crawl # Filtering pattern="*/product/*", # URL pattern matching - ignore_patterns=["*/reviews/*", "*/questions/*"], # Validation live_check=True, # Verify URLs are alive - max_urls=5000, # Stop at 5000 URLs + max_urls=50, # Stop at 50 URLs # Performance - concurrency=100, # Parallel requests - hits_per_sec=10 # Rate limiting + concurrency=100, # Maximum concurrent requests for live checks/head extraction + hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers ) -seeder = AsyncUrlSeeder(seeder_config) -urls = await seeder.discover("https://shop.example.com") +async with AsyncUrlSeeder() as seeder: + console.print("Discovering URLs from Python docs...") + urls = await seeder.urls("docs.python.org", seeder_config) + console.print(f"\n✓ Discovered {len(urls)} URLs") # Advanced: Relevance-based discovery research_config = SeedingConfig( - source="crawl+sitemap", # Deep crawl + sitemap + source="sitemap+cc", # Sitemap + Common Crawl pattern="*/blog/*", # Blog posts only # Content relevance extract_head=True, # Get meta tags query="quantum computing tutorials", - scoring_method="bm25", # Or "semantic" (coming soon) + scoring_method="bm25", # BM25 scoring method score_threshold=0.4, # High relevance only # Smart filtering filter_nonsense_urls=True, # Remove .xml, .txt, etc. 
- min_content_length=500, # Skip thin content force=True # Bypass cache ) # Discover with progress tracking discovered = [] -async for batch in seeder.discover_iter("https://physics-blog.com", research_config): - discovered.extend(batch) - print(f"Found {len(discovered)} relevant URLs so far...") +async with AsyncUrlSeeder() as seeder: + discovered = await seeder.urls("https://physics-blog.com", research_config) + console.print(f"\n✓ Discovered {len(discovered)} URLs") # Results include scores and metadata for url_data in discovered[:5]: print(f"URL: {url_data['url']}") - print(f"Score: {url_data['score']:.3f}") - print(f"Title: {url_data['title']}") + print(f"Score: {url_data['relevance_score']:.3f}") + print(f"Title: {url_data['head_data']['title']}") ``` **Discovery Methods:** @@ -294,35 +297,18 @@ This release includes significant performance improvements through optimized res ### What We Optimized ```python -# Before v0.7.0 (slow) +# Optimized crawling with v0.7.0 improvements results = [] for url in urls: - result = await crawler.arun(url) - results.append(result) - -# After v0.7.0 (fast) -# Automatic batching and connection pooling -results = await crawler.arun_batch( - urls, - config=CrawlerRunConfig( - # New performance options - batch_size=10, # Process 10 URLs concurrently - reuse_browser=True, # Keep browser warm - eager_loading=False, # Load only what's needed - streaming_extraction=True, # Stream large extractions - - # Optimized defaults - wait_until="domcontentloaded", # Faster than networkidle - exclude_external_resources=True, # Skip third-party assets - block_ads=True # Ad blocking built-in + result = await crawler.arun( + url, + config=CrawlerRunConfig( + # Performance optimizations + wait_until="domcontentloaded", # Faster than networkidle + cache_mode=CacheMode.ENABLED # Enable caching + ) ) -) - -# Memory-efficient streaming for large crawls -async for result in crawler.arun_stream(large_url_list): - # Process results as they complete - await 
process_result(result) - # Memory is freed after each iteration + results.append(result) ``` **Performance Gains:** @@ -332,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list): - **Memory Usage**: 60% reduction with streaming processing - **Concurrent Crawls**: Handle 5x more parallel requests -## 📄 PDF Support - -PDF extraction is now natively supported in Crawl4AI. - -```python -# Extract data from PDF documents -result = await crawler.arun( - "https://example.com/report.pdf", - config=CrawlerRunConfig( - pdf_extraction=True, - extraction_strategy=JsonCssExtractionStrategy({ - # Works on converted PDF structure - "title": {"selector": "h1", "type": "text"}, - "sections": {"selector": "h2", "type": "list"} - }) - ) -) -``` ## 🔧 Important Changes diff --git a/docs/md_v2/core/adaptive-crawling.md b/docs/md_v2/core/adaptive-crawling.md index 72e11937..ea1674c2 100644 --- a/docs/md_v2/core/adaptive-crawling.md +++ b/docs/md_v2/core/adaptive-crawling.md @@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler async def main(): async with AsyncWebCrawler() as crawler: - # Create an adaptive crawler + # Create an adaptive crawler (config is optional) adaptive = AdaptiveCrawler(crawler) # Start crawling with a query @@ -59,13 +59,13 @@ async def main(): from crawl4ai import AdaptiveConfig config = AdaptiveConfig( - confidence_threshold=0.7, # Stop when 70% confident (default: 0.8) - max_pages=20, # Maximum pages to crawl (default: 50) - top_k_links=3, # Links to follow per page (default: 5) + confidence_threshold=0.8, # Stop when 80% confident (default: 0.7) + max_pages=30, # Maximum pages to crawl (default: 20) + top_k_links=5, # Links to follow per page (default: 3) min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1) ) -adaptive = AdaptiveCrawler(crawler, config=config) +adaptive = AdaptiveCrawler(crawler, config) ``` ## Crawling Strategies @@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False): The 
confidence score (0-1) indicates how sufficient the gathered information is: - **0.0-0.3**: Insufficient information, needs more crawling - **0.3-0.6**: Partial information, may answer basic queries -- **0.6-0.8**: Good coverage, can answer most queries -- **0.8-1.0**: Excellent coverage, comprehensive information +- **0.6-0.7**: Good coverage, can answer most queries +- **0.7-1.0**: Excellent coverage, comprehensive information ### Statistics Display @@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl") - Avoid overly broad queries ### 2. Threshold Tuning -- Start with default (0.8) for general use -- Lower to 0.6-0.7 for exploratory crawling -- Raise to 0.9+ for exhaustive coverage +- Start with default (0.7) for general use +- Lower to 0.5-0.6 for exploratory crawling +- Raise to 0.8+ for exhaustive coverage ### 3. Performance Optimization - Use appropriate `max_pages` limits diff --git a/docs/md_v2/core/url-seeding.md b/docs/md_v2/core/url-seeding.md index 24cdfa46..f891c204 100644 --- a/docs/md_v2/core/url-seeding.md +++ b/docs/md_v2/core/url-seeding.md @@ -137,7 +137,7 @@ async def smart_blog_crawler(): word_count_threshold=300 # Only substantial articles ) - # Extract URLs and stream results as they come + # Extract URLs and crawl them tutorial_urls = [t["url"] for t in tutorials[:10]] results = await crawler.arun_many(tutorial_urls, config=config) @@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I ```python # Use both sources -config = SeedingConfig(source="cc+sitemap") +config = SeedingConfig(source="sitemap+cc") urls = await seeder.urls("example.com", config) ``` @@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. 
Here's everything you can conf | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" | +| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" | | `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") | | `extract_head` | bool | False | Extract metadata from page `` | | `live_check` | bool | False | Verify URLs are accessible | | `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) | | `concurrency` | int | 10 | Parallel workers for fetching | -| `hits_per_sec` | int | None | Rate limit for requests | +| `hits_per_sec` | int | 5 | Rate limit for requests | | `force` | bool | False | Bypass cache, fetch fresh data | | `verbose` | bool | False | Show detailed progress | | `query` | str | None | Search query for BM25 scoring | @@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config) ```python # Find specific products config = SeedingConfig( - source="cc+sitemap", # Use both sources + source="sitemap+cc", # Use both sources extract_head=True, query="wireless headphones noise canceling", scoring_method="bm25", @@ -782,7 +782,7 @@ class ResearchAssistant: # Step 1: Discover relevant URLs config = SeedingConfig( - source="cc+sitemap", # Maximum coverage + source="sitemap+cc", # Maximum coverage extract_head=True, # Get metadata query=topic, # Research topic scoring_method="bm25", # Smart scoring @@ -832,7 +832,8 @@ class ResearchAssistant: # Extract URLs and crawl all articles article_urls = [article['url'] for article in top_articles] results = [] - async for result in await crawler.arun_many(article_urls, config=config): + crawl_results = await crawler.arun_many(article_urls, config=config) + async for result in crawl_results: if result.success: results.append({ 'url': result.url, @@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5) # When 
crawling many URLs async with AsyncWebCrawler() as crawler: # Assuming urls is a list of URL strings - results = await crawler.arun_many(urls, config=config) + crawl_results = await crawler.arun_many(urls, config=config) # Process as they arrive - async for result in results: + async for result in crawl_results: process_immediately(result) # Don't wait for all ``` @@ -1020,7 +1021,7 @@ config = SeedingConfig( # E-commerce product discovery config = SeedingConfig( - source="cc+sitemap", + source="sitemap+cc", pattern="*/product/*", extract_head=True, live_check=True From 0eaa9f9895cd7b1f7bbe40e5f825c13d05bd0f5d Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Jul 2025 13:49:07 +0200 Subject: [PATCH 6/9] fix: handle infinity values in JSON serialization for API responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add sanitize_json_data() function to convert infinity/NaN to JSON-compliant strings - Fix /execute_js endpoint returning ValueError: Out of range float values are not JSON compliant: inf - Fix /crawl endpoint batch responses with infinity values - Fix /crawl/stream endpoint streaming responses with infinity values - Fix /crawl/job endpoint background job responses with infinity values The sanitize_json_data() function recursively processes response data: - float('inf') → \"Infinity\" - float('-inf') → \"-Infinity\" - float('nan') → \"NaN\" This prevents JSON serialization errors when JavaScript execution or crawling operations produce infinity values, ensuring all API endpoints return valid JSON. 
Fixes: API endpoints crashing with infinity JSON serialization errors Affects: /execute_js, /crawl, /crawl/stream, /crawl/job endpoints --- deploy/docker/api.py | 29 ++- deploy/docker/server.py | 25 ++- tests/docker/simple_api_test.py | 345 ++++++++++++++++++++++++++++++++ 3 files changed, 395 insertions(+), 4 deletions(-) create mode 100644 tests/docker/simple_api_test.py diff --git a/deploy/docker/api.py b/deploy/docker/api.py index b728acd1..f6df5e3f 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -54,6 +54,27 @@ def _get_memory_mb(): logger.warning(f"Could not get memory info: {e}") return None +# --- Helper to sanitize JSON data --- +def sanitize_json_data(data): + """ + Recursively sanitize data to handle infinity and NaN values that are not JSON compliant. + """ + import math + + if isinstance(data, dict): + return {k: sanitize_json_data(v) for k, v in data.items()} + elif isinstance(data, list): + return [sanitize_json_data(item) for item in data] + elif isinstance(data, float): + if math.isinf(data): + return "Infinity" if data > 0 else "-Infinity" + elif math.isnan(data): + return "NaN" + else: + return data + else: + return data + async def handle_llm_qa( url: str, @@ -371,8 +392,10 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) server_memory_mb = _get_memory_mb() result_dict = result.model_dump() result_dict['server_memory_mb'] = server_memory_mb - logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}") - data = json.dumps(result_dict, default=datetime_handler) + "\n" + # Sanitize data to handle infinity values + sanitized_dict = sanitize_json_data(result_dict) + logger.info(f"Streaming result for {sanitized_dict.get('url', 'unknown')}") + data = json.dumps(sanitized_dict, default=datetime_handler) + "\n" yield data.encode('utf-8') except Exception as e: logger.error(f"Serialization error: {e}") @@ -446,7 +469,7 @@ async def handle_crawl_request( return { "success": True, - "results": 
[result.model_dump() for result in results], + "results": [sanitize_json_data(result.model_dump()) for result in results], "server_processing_time_s": end_time - start_time, "server_memory_delta_mb": mem_delta_mb, "server_peak_memory_mb": peak_mem_mb diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 0bd6ac2d..d410c710 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -331,6 +331,27 @@ async def generate_pdf( return {"success": True, "pdf": base64.b64encode(pdf_data).decode()} +def sanitize_json_data(data): + """ + Recursively sanitize data to handle infinity and NaN values that are not JSON compliant. + """ + import math + + if isinstance(data, dict): + return {k: sanitize_json_data(v) for k, v in data.items()} + elif isinstance(data, list): + return [sanitize_json_data(item) for item in data] + elif isinstance(data, float): + if math.isinf(data): + return "Infinity" if data > 0 else "-Infinity" + elif math.isnan(data): + return "NaN" + else: + return data + else: + return data + + @app.post("/execute_js") @limiter.limit(config["rate_limiting"]["default_limit"]) @mcp_tool("execute_js") @@ -389,7 +410,9 @@ async def execute_js( results = await crawler.arun(url=body.url, config=cfg) # Return JSON-serializable dict of the first CrawlResult data = results[0].model_dump() - return JSONResponse(data) + # Sanitize data to handle infinity values + sanitized_data = sanitize_json_data(data) + return JSONResponse(sanitized_data) @app.get("/llm/{url:path}") diff --git a/tests/docker/simple_api_test.py b/tests/docker/simple_api_test.py new file mode 100644 index 00000000..0a966d5e --- /dev/null +++ b/tests/docker/simple_api_test.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 +""" +Simple API Test for Crawl4AI Docker Server v0.7.0 +Uses only built-in Python modules to test all endpoints. 
+""" + +import urllib.request +import urllib.parse +import json +import time +import sys +from typing import Dict, List, Optional + +# Configuration +BASE_URL = "http://localhost:11234" # Change to your server URL +TEST_TIMEOUT = 30 + +class SimpleApiTester: + def __init__(self, base_url: str = BASE_URL): + self.base_url = base_url + self.token = None + self.results = [] + + def log(self, message: str): + print(f"[INFO] {message}") + + def test_get_endpoint(self, endpoint: str) -> Dict: + """Test a GET endpoint""" + url = f"{self.base_url}{endpoint}" + start_time = time.time() + + try: + req = urllib.request.Request(url) + if self.token: + req.add_header('Authorization', f'Bearer {self.token}') + + with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response: + response_time = time.time() - start_time + status_code = response.getcode() + content = response.read().decode('utf-8') + + # Try to parse JSON + try: + data = json.loads(content) + except: + data = {"raw_response": content[:200]} + + return { + "endpoint": endpoint, + "method": "GET", + "status": "PASS" if status_code < 400 else "FAIL", + "status_code": status_code, + "response_time": response_time, + "data": data + } + except Exception as e: + response_time = time.time() - start_time + return { + "endpoint": endpoint, + "method": "GET", + "status": "FAIL", + "status_code": None, + "response_time": response_time, + "error": str(e) + } + + def test_post_endpoint(self, endpoint: str, payload: Dict) -> Dict: + """Test a POST endpoint""" + url = f"{self.base_url}{endpoint}" + start_time = time.time() + + try: + data = json.dumps(payload).encode('utf-8') + req = urllib.request.Request(url, data=data, method='POST') + req.add_header('Content-Type', 'application/json') + + if self.token: + req.add_header('Authorization', f'Bearer {self.token}') + + with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response: + response_time = time.time() - start_time + status_code = response.getcode() + content = 
response.read().decode('utf-8') + + # Try to parse JSON + try: + data = json.loads(content) + except: + data = {"raw_response": content[:200]} + + return { + "endpoint": endpoint, + "method": "POST", + "status": "PASS" if status_code < 400 else "FAIL", + "status_code": status_code, + "response_time": response_time, + "data": data + } + except Exception as e: + response_time = time.time() - start_time + return { + "endpoint": endpoint, + "method": "POST", + "status": "FAIL", + "status_code": None, + "response_time": response_time, + "error": str(e) + } + + def print_result(self, result: Dict): + """Print a formatted test result""" + status_color = { + "PASS": "✅", + "FAIL": "❌", + "SKIP": "⏭️" + } + + print(f"{status_color[result['status']]} {result['method']} {result['endpoint']} " + f"| {result['response_time']:.3f}s | Status: {result['status_code'] or 'N/A'}") + + if result['status'] == 'FAIL' and 'error' in result: + print(f" Error: {result['error']}") + + self.results.append(result) + + def run_all_tests(self): + """Run all API tests""" + print("🚀 Starting Crawl4AI v0.7.0 API Test Suite") + print(f"📡 Testing server at: {self.base_url}") + print("=" * 60) + + # # Test basic endpoints + # print("\n=== BASIC ENDPOINTS ===") + + # # Health check + # result = self.test_get_endpoint("/health") + # self.print_result(result) + + + # # Schema endpoint + # result = self.test_get_endpoint("/schema") + # self.print_result(result) + + # # Metrics endpoint + # result = self.test_get_endpoint("/metrics") + # self.print_result(result) + + # # Root redirect + # result = self.test_get_endpoint("/") + # self.print_result(result) + + # # Test authentication + # print("\n=== AUTHENTICATION ===") + + # # Get token + # token_payload = {"email": "test@example.com"} + # result = self.test_post_endpoint("/token", token_payload) + # self.print_result(result) + + # # Extract token if successful + # if result['status'] == 'PASS' and 'data' in result: + # token = 
result['data'].get('access_token') + # if token: + # self.token = token + # self.log(f"Successfully obtained auth token: {token[:20]}...") + + # Test core APIs + print("\n=== CORE APIs ===") + + test_url = "https://example.com" + + # Test markdown endpoint + md_payload = { + "url": test_url, + "f": "fit", + "q": "test query", + "c": "0" + } + result = self.test_post_endpoint("/md", md_payload) + # print(result['data'].get('markdown', '')) + self.print_result(result) + + # Test HTML endpoint + html_payload = {"url": test_url} + result = self.test_post_endpoint("/html", html_payload) + self.print_result(result) + + # Test screenshot endpoint + screenshot_payload = { + "url": test_url, + "screenshot_wait_for": 2 + } + result = self.test_post_endpoint("/screenshot", screenshot_payload) + self.print_result(result) + + # Test PDF endpoint + pdf_payload = {"url": test_url} + result = self.test_post_endpoint("/pdf", pdf_payload) + self.print_result(result) + + # Test JavaScript execution + js_payload = { + "url": test_url, + "scripts": ["(() => document.title)()"] + } + result = self.test_post_endpoint("/execute_js", js_payload) + self.print_result(result) + + # Test crawl endpoint + crawl_payload = { + "urls": [test_url], + "browser_config": {}, + "crawler_config": {} + } + result = self.test_post_endpoint("/crawl", crawl_payload) + self.print_result(result) + + # Test config dump + config_payload = {"code": "CrawlerRunConfig()"} + result = self.test_post_endpoint("/config/dump", config_payload) + self.print_result(result) + + # Test LLM endpoint + llm_endpoint = f"/llm/{test_url}?q=Extract%20main%20content" + result = self.test_get_endpoint(llm_endpoint) + self.print_result(result) + + # Test ask endpoint + ask_endpoint = "/ask?context_type=all&query=crawl4ai&max_results=5" + result = self.test_get_endpoint(ask_endpoint) + print(result) + self.print_result(result) + + # Test job APIs + print("\n=== JOB APIs ===") + + # Test LLM job + llm_job_payload = { + "url": 
test_url, + "q": "Extract main content", + "cache": False + } + result = self.test_post_endpoint("/llm/job", llm_job_payload) + self.print_result(result) + + # Test crawl job + crawl_job_payload = { + "urls": [test_url], + "browser_config": {}, + "crawler_config": {} + } + result = self.test_post_endpoint("/crawl/job", crawl_job_payload) + self.print_result(result) + + # Test MCP + print("\n=== MCP APIs ===") + + # Test MCP schema + result = self.test_get_endpoint("/mcp/schema") + self.print_result(result) + + # Test error handling + print("\n=== ERROR HANDLING ===") + + # Test invalid URL + invalid_payload = {"url": "invalid-url", "f": "fit"} + result = self.test_post_endpoint("/md", invalid_payload) + self.print_result(result) + + # Test invalid endpoint + result = self.test_get_endpoint("/nonexistent") + self.print_result(result) + + # Print summary + self.print_summary() + + def print_summary(self): + """Print test results summary""" + print("\n" + "=" * 60) + print("📊 TEST RESULTS SUMMARY") + print("=" * 60) + + total = len(self.results) + passed = sum(1 for r in self.results if r['status'] == 'PASS') + failed = sum(1 for r in self.results if r['status'] == 'FAIL') + + print(f"Total Tests: {total}") + print(f"✅ Passed: {passed}") + print(f"❌ Failed: {failed}") + print(f"📈 Success Rate: {(passed/total)*100:.1f}%") + + if failed > 0: + print("\n❌ FAILED TESTS:") + for result in self.results: + if result['status'] == 'FAIL': + print(f" • {result['method']} {result['endpoint']}") + if 'error' in result: + print(f" Error: {result['error']}") + + # Performance statistics + response_times = [r['response_time'] for r in self.results if r['response_time'] > 0] + if response_times: + avg_time = sum(response_times) / len(response_times) + max_time = max(response_times) + print(f"\n⏱️ Average Response Time: {avg_time:.3f}s") + print(f"⏱️ Max Response Time: {max_time:.3f}s") + + # Save detailed report + report_file = f"crawl4ai_test_report_{int(time.time())}.json" + with 
open(report_file, 'w') as f: + json.dump({ + "timestamp": time.time(), + "server_url": self.base_url, + "version": "0.7.0", + "summary": { + "total": total, + "passed": passed, + "failed": failed + }, + "results": self.results + }, f, indent=2) + + print(f"\n📄 Detailed report saved to: {report_file}") + +def main(): + """Main test runner""" + import argparse + + parser = argparse.ArgumentParser(description='Crawl4AI v0.7.0 API Test Suite') + parser.add_argument('--url', default=BASE_URL, help='Base URL of the server') + + args = parser.parse_args() + + tester = SimpleApiTester(args.url) + + try: + tester.run_all_tests() + except KeyboardInterrupt: + print("\n🛑 Test suite interrupted by user") + except Exception as e: + print(f"\n💥 Test suite failed with error: {e}") + sys.exit(1) + +if __name__ == "__main__": + main() \ No newline at end of file From ccbe3c105c01081109167a5a114c5cf029bc94f4 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 17 Jul 2025 09:13:20 +0200 Subject: [PATCH 7/9] refactor: improve link scoring output format in release notes --- docs/blog/release-v0.7.0.md | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/docs/blog/release-v0.7.0.md b/docs/blog/release-v0.7.0.md index 0772ae58..49f34021 100644 --- a/docs/blog/release-v0.7.0.md +++ b/docs/blog/release-v0.7.0.md @@ -174,23 +174,20 @@ internal_links = result.links.get("internal", []) scored_links = [l for l in internal_links if l.get("total_score")] scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True) -# Create a scoring table -table = Table(title="Link Scoring Results", box=box.ROUNDED) -table.add_column("Link Text", style="cyan", width=40) -table.add_column("Intrinsic Score", justify="center") -table.add_column("Contextual Score", justify="center") -table.add_column("Total Score", justify="center", style="bold green") - +# Print scoring results +print("Link Scoring Results:") +print("=" * 50) for link in scored_links[:5]: text = 
link.get('text', 'No text')[:40] - table.add_row( - text, - f"{link.get('intrinsic_score', 0):.1f}/10", - f"{link.get('contextual_score', 0):.2f}/1", - f"{link.get('total_score', 0):.3f}" - ) - -console.print(table) + intrinsic = link.get('intrinsic_score', 0) + contextual = link.get('contextual_score', 0) + total = link.get('total_score', 0) + + print(f"Link: {text}") + print(f" Intrinsic Score: {intrinsic:.1f}/10") + print(f" Contextual Score: {contextual:.2f}/1") + print(f" Total Score: {total:.3f}") + print("-" * 30) ``` **Scoring Components:** From cf8badfe276b807c2835a52814db17136d02d1d0 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 17 Jul 2025 11:35:16 +0200 Subject: [PATCH 8/9] feat: cleanup unused code and enhance documentation for v0.7.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove unused StealthConfig from browser_manager.py - Update LinkPreviewConfig import path in __init__.py and examples - Fix infinity handling in content_scraping_strategy.py (use 0 instead of float('inf')) - Remove sanitize_json_data functions from API endpoints - Add comprehensive C4A Script documentation to release notes - Update v0.7.0 release notes with improved code examples - Create v0.7.1 release notes focusing on cleanup and documentation improvements - Update demo files with corrected import paths and examples - Fix virtual scroll and adaptive crawling examples across documentation 🤖 Generated with Claude Code Co-Authored-By: Claude --- crawl4ai/__init__.py | 3 +- crawl4ai/browser_manager.py | 15 -- crawl4ai/content_scraping_strategy.py | 4 +- deploy/docker/api.py | 29 +-- deploy/docker/server.py | 25 +-- docs/blog/release-v0.7.0.md | 203 ++++++++--------- docs/blog/release-v0.7.1.md | 43 ++++ docs/examples/link_head_extraction_example.py | 2 +- docs/md_v2/blog/releases/0.7.0.md | 206 ++++++++---------- docs/md_v2/core/link-media.md | 4 +- .../crawl4ai_v0_7_0_showcase.py | 2 +- 
docs/releases_review/v0_7_0_features_demo.py | 44 +--- tests/test_link_extractor.py | 4 +- 13 files changed, 241 insertions(+), 343 deletions(-) create mode 100644 docs/blog/release-v0.7.1.md diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 7a75e76d..be3cab0a 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ import warnings from .async_webcrawler import AsyncWebCrawler, CacheMode # MODIFIED: Add SeedingConfig and VirtualScrollConfig here -from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig from .content_scraping_strategy import ( ContentScrapingStrategy, @@ -173,6 +173,7 @@ __all__ = [ "CompilationResult", "ValidationResult", "ErrorDetail", + "LinkPreviewConfig" ] diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 6ee43961..08c1f52f 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -14,23 +14,8 @@ import hashlib from .js_snippet import load_js_script from .config import DOWNLOAD_PAGE_TIMEOUT from .async_configs import BrowserConfig, CrawlerRunConfig -from playwright_stealth import StealthConfig from .utils import get_chromium_path -stealth_config = StealthConfig( - webdriver=True, - chrome_app=True, - chrome_csi=True, - chrome_load_times=True, - chrome_runtime=True, - navigator_languages=True, - navigator_plugins=True, - navigator_permissions=True, - webgl_vendor=True, - outerdimensions=True, - navigator_hardware_concurrency=True, - media_codecs=True, -) BROWSER_DISABLE_OPTIONS = [ "--disable-background-networking", diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index f1ea5fa5..3751d52f 100644 --- a/crawl4ai/content_scraping_strategy.py +++ 
b/crawl4ai/content_scraping_strategy.py @@ -1145,10 +1145,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): link_data["intrinsic_score"] = intrinsic_score except Exception: # Fail gracefully - assign default score - link_data["intrinsic_score"] = float('inf') + link_data["intrinsic_score"] = 0 else: # No scoring enabled - assign infinity (all links equal priority) - link_data["intrinsic_score"] = float('inf') + link_data["intrinsic_score"] = 0 is_external = is_external_url(normalized_href, base_domain) if is_external: diff --git a/deploy/docker/api.py b/deploy/docker/api.py index f6df5e3f..b728acd1 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -54,27 +54,6 @@ def _get_memory_mb(): logger.warning(f"Could not get memory info: {e}") return None -# --- Helper to sanitize JSON data --- -def sanitize_json_data(data): - """ - Recursively sanitize data to handle infinity and NaN values that are not JSON compliant. - """ - import math - - if isinstance(data, dict): - return {k: sanitize_json_data(v) for k, v in data.items()} - elif isinstance(data, list): - return [sanitize_json_data(item) for item in data] - elif isinstance(data, float): - if math.isinf(data): - return "Infinity" if data > 0 else "-Infinity" - elif math.isnan(data): - return "NaN" - else: - return data - else: - return data - async def handle_llm_qa( url: str, @@ -392,10 +371,8 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) server_memory_mb = _get_memory_mb() result_dict = result.model_dump() result_dict['server_memory_mb'] = server_memory_mb - # Sanitize data to handle infinity values - sanitized_dict = sanitize_json_data(result_dict) - logger.info(f"Streaming result for {sanitized_dict.get('url', 'unknown')}") - data = json.dumps(sanitized_dict, default=datetime_handler) + "\n" + logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}") + data = json.dumps(result_dict, default=datetime_handler) + "\n" yield data.encode('utf-8') 
except Exception as e: logger.error(f"Serialization error: {e}") @@ -469,7 +446,7 @@ async def handle_crawl_request( return { "success": True, - "results": [sanitize_json_data(result.model_dump()) for result in results], + "results": [result.model_dump() for result in results], "server_processing_time_s": end_time - start_time, "server_memory_delta_mb": mem_delta_mb, "server_peak_memory_mb": peak_mem_mb diff --git a/deploy/docker/server.py b/deploy/docker/server.py index d410c710..0bd6ac2d 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -331,27 +331,6 @@ async def generate_pdf( return {"success": True, "pdf": base64.b64encode(pdf_data).decode()} -def sanitize_json_data(data): - """ - Recursively sanitize data to handle infinity and NaN values that are not JSON compliant. - """ - import math - - if isinstance(data, dict): - return {k: sanitize_json_data(v) for k, v in data.items()} - elif isinstance(data, list): - return [sanitize_json_data(item) for item in data] - elif isinstance(data, float): - if math.isinf(data): - return "Infinity" if data > 0 else "-Infinity" - elif math.isnan(data): - return "NaN" - else: - return data - else: - return data - - @app.post("/execute_js") @limiter.limit(config["rate_limiting"]["default_limit"]) @mcp_tool("execute_js") @@ -410,9 +389,7 @@ async def execute_js( results = await crawler.arun(url=body.url, config=cfg) # Return JSON-serializable dict of the first CrawlResult data = results[0].model_dump() - # Sanitize data to handle infinity values - sanitized_data = sanitize_json_data(data) - return JSONResponse(sanitized_data) + return JSONResponse(data) @app.get("/llm/{url:path}") diff --git a/docs/blog/release-v0.7.0.md b/docs/blog/release-v0.7.0.md index 49f34021..1b474a99 100644 --- a/docs/blog/release-v0.7.0.md +++ b/docs/blog/release-v0.7.0.md @@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking: ```python from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, 
AdaptiveConfig +import asyncio -# Initialize with custom adaptive parameters -config = AdaptiveConfig( - confidence_threshold=0.7, # Min confidence to stop crawling - max_depth=5, # Maximum crawl depth - max_pages=20, # Maximum number of pages to crawl - top_k_links=3, # Number of top links to follow per page - strategy="statistical", # 'statistical' or 'embedding' - coverage_weight=0.4, # Weight for coverage in confidence calculation - consistency_weight=0.3, # Weight for consistency in confidence calculation - saturation_weight=0.3 # Weight for saturation in confidence calculation -) - -# Initialize adaptive crawler with web crawler -async with AsyncWebCrawler() as crawler: - adaptive_crawler = AdaptiveCrawler(crawler, config) +async def main(): - # Crawl and learn patterns - state = await adaptive_crawler.digest( - start_url="https://news.example.com/article/12345", - query="latest news articles and content" + # Configure adaptive crawler + config = AdaptiveConfig( + strategy="statistical", # or "embedding" for semantic understanding + max_pages=10, + confidence_threshold=0.7, # Stop at 70% confidence + top_k_links=3, # Follow top 3 links per page + min_gain_threshold=0.05 # Need 5% information gain to continue ) - # Access results and confidence - print(f"Confidence Level: {adaptive_crawler.confidence:.0%}") - print(f"Pages Crawled: {len(state.crawled_urls)}") - print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents") + async with AsyncWebCrawler(verbose=False) as crawler: + adaptive = AdaptiveCrawler(crawler, config) + + print("Starting adaptive crawl about Python decorators...") + result = await adaptive.digest( + start_url="https://docs.python.org/3/glossary.html", + query="python decorators functions wrapping" + ) + + print(f"\n✅ Crawling Complete!") + print(f"• Confidence Level: {adaptive.confidence:.0%}") + print(f"• Pages Crawled: {len(result.crawled_urls)}") + print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} 
documents") + + # Get most relevant content + relevant = adaptive.get_relevant_content(top_k=3) + print(f"\nMost Relevant Pages:") + for i, page in enumerate(relevant, 1): + print(f"{i}. {page['url']} (relevance: {page['score']:.2%})") + +asyncio.run(main()) ``` **Expected Real-World Impact:** @@ -141,53 +148,47 @@ async with AsyncWebCrawler() as crawler: **My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals. -### The Three-Layer Scoring System +### Intelligent Link Analysis and Scoring ```python -from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode +import asyncio +from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler +from crawl4ai.adaptive_crawler import LinkPreviewConfig -# Configure intelligent link analysis -link_config = LinkPreviewConfig( - include_internal=True, - include_external=False, - max_links=10, - concurrency=5, - query="python tutorial", # For contextual scoring - score_threshold=0.3, - verbose=True -) - -# Use in your crawl -result = await crawler.arun( - "https://tech-blog.example.com", - config=CrawlerRunConfig( - link_preview_config=link_config, - score_links=True, # Enable intrinsic scoring - cache_mode=CacheMode.BYPASS +async def main(): + # Configure intelligent link analysis + link_config = LinkPreviewConfig( + include_internal=True, + include_external=False, + max_links=10, + concurrency=5, + query="python tutorial", # For contextual scoring + score_threshold=0.3, + verbose=True ) -) + # Use in your crawl + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://www.geeksforgeeks.org/", + config=CrawlerRunConfig( + link_preview_config=link_config, + score_links=True, # Enable intrinsic scoring + cache_mode=CacheMode.BYPASS + ) + ) -# Access scored and sorted links -if result.success and result.links: -# Get scored links -internal_links = result.links.get("internal", []) 
-scored_links = [l for l in internal_links if l.get("total_score")] -scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True) + # Access scored and sorted links + if result.success and result.links: + for link in result.links.get("internal", []): + text = link.get('text', 'No text')[:40] + print( + text, + f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10", + f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1", + f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000" + ) -# Print scoring results -print("Link Scoring Results:") -print("=" * 50) -for link in scored_links[:5]: - text = link.get('text', 'No text')[:40] - intrinsic = link.get('intrinsic_score', 0) - contextual = link.get('contextual_score', 0) - total = link.get('total_score', 0) - - print(f"Link: {text}") - print(f" Intrinsic Score: {intrinsic:.1f}/10") - print(f" Contextual Score: {contextual:.2f}/1") - print(f" Total Score: {total:.3f}") - print("-" * 30) +asyncio.run(main()) ``` **Scoring Components:** @@ -220,58 +221,34 @@ for link in scored_links[:5]: ### Technical Architecture ```python +import asyncio from crawl4ai import AsyncUrlSeeder, SeedingConfig -# Basic discovery - find all product pages -seeder_config = SeedingConfig( - # Discovery sources - source="cc+sitemap", # Sitemap + Common Crawl - - # Filtering - pattern="*/product/*", # URL pattern matching - - # Validation - live_check=True, # Verify URLs are alive - max_urls=50, # Stop at 50 URLs - - # Performance - concurrency=100, # Maximum concurrent requests for live checks/head extraction - hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers -) +async def main(): + async with AsyncUrlSeeder() as seeder: + # Discover Python tutorial URLs + config = SeedingConfig( + source="sitemap", # Use sitemap + pattern="*python*", # URL pattern filter + extract_head=True, # Get 
metadata + query="python tutorial", # For relevance scoring + scoring_method="bm25", + score_threshold=0.2, + max_urls=10 + ) + + print("Discovering Python async tutorial URLs...") + urls = await seeder.urls("https://www.geeksforgeeks.org/", config) + + print(f"\n✅ Found {len(urls)} relevant URLs:") + for i, url_info in enumerate(urls[:5], 1): + print(f"\n{i}. {url_info['url']}") + if url_info.get('relevance_score'): + print(f" Relevance: {url_info['relevance_score']:.3f}") + if url_info.get('head_data', {}).get('title'): + print(f" Title: {url_info['head_data']['title'][:60]}...") -async with AsyncUrlSeeder() as seeder: - console.print("Discovering URLs from Python docs...") - urls = await seeder.urls("docs.python.org", seeding_config) - console.print(f"\n✓ Discovered {len(urls)} URLs") - -# Advanced: Relevance-based discovery -research_config = SeedingConfig( - source="sitemap+cc", # Sitemap + Common Crawl - pattern="*/blog/*", # Blog posts only - - # Content relevance - extract_head=True, # Get meta tags - query="quantum computing tutorials", - scoring_method="bm25", # BM25 scoring method - score_threshold=0.4, # High relevance only - - # Smart filtering - filter_nonsense_urls=True, # Remove .xml, .txt, etc. 
- - force=True # Bypass cache -) - -# Discover with progress tracking -discovered = [] -async with AsyncUrlSeeder() as seeder: - discovered = await seeder.urls("https://physics-blog.com", research_config) - console.print(f"\n✓ Discovered {len(discovered)} URLs") - -# Results include scores and metadata -for url_data in discovered[:5]: - print(f"URL: {url_data['url']}") - print(f"Score: {url_data['relevance_score']:.3f}") - print(f"Title: {url_data['head_data']['title']}") +asyncio.run(main()) ``` **Discovery Methods:** diff --git a/docs/blog/release-v0.7.1.md b/docs/blog/release-v0.7.1.md new file mode 100644 index 00000000..d5bfdaec --- /dev/null +++ b/docs/blog/release-v0.7.1.md @@ -0,0 +1,43 @@ +# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update + +*July 17, 2025 • 2 min read* + +--- + +A small maintenance release that removes unused code and improves documentation. + +## 🎯 What's Changed + +- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py` +- **Updated documentation** with better examples and parameter explanations +- **Fixed virtual scroll configuration** examples in docs + +## 🧹 Code Cleanup + +Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead. + +```python +# Removed unused code: +from playwright_stealth import StealthConfig +stealth_config = StealthConfig(...) # This was never used +``` + +## 📖 Documentation Updates + +- Fixed adaptive crawling parameter examples +- Updated session management documentation +- Corrected virtual scroll configuration examples + +## 🚀 Installation + +```bash +pip install crawl4ai==0.7.1 +``` + +No breaking changes - upgrade directly from v0.7.0. + +--- + +Questions? Issues? 
+- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai) +- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN) \ No newline at end of file diff --git a/docs/examples/link_head_extraction_example.py b/docs/examples/link_head_extraction_example.py index ef146d95..500566ab 100644 --- a/docs/examples/link_head_extraction_example.py +++ b/docs/examples/link_head_extraction_example.py @@ -18,7 +18,7 @@ Usage: import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig async def basic_link_head_extraction(): diff --git a/docs/md_v2/blog/releases/0.7.0.md b/docs/md_v2/blog/releases/0.7.0.md index 0772ae58..1b474a99 100644 --- a/docs/md_v2/blog/releases/0.7.0.md +++ b/docs/md_v2/blog/releases/0.7.0.md @@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking: ```python from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig +import asyncio -# Initialize with custom adaptive parameters -config = AdaptiveConfig( - confidence_threshold=0.7, # Min confidence to stop crawling - max_depth=5, # Maximum crawl depth - max_pages=20, # Maximum number of pages to crawl - top_k_links=3, # Number of top links to follow per page - strategy="statistical", # 'statistical' or 'embedding' - coverage_weight=0.4, # Weight for coverage in confidence calculation - consistency_weight=0.3, # Weight for consistency in confidence calculation - saturation_weight=0.3 # Weight for saturation in confidence calculation -) - -# Initialize adaptive crawler with web crawler -async with AsyncWebCrawler() as crawler: - adaptive_crawler = AdaptiveCrawler(crawler, config) +async def main(): - # Crawl and learn patterns - state = await adaptive_crawler.digest( - start_url="https://news.example.com/article/12345", - query="latest news articles and content" + # Configure adaptive crawler + config = AdaptiveConfig( + 
strategy="statistical", # or "embedding" for semantic understanding + max_pages=10, + confidence_threshold=0.7, # Stop at 70% confidence + top_k_links=3, # Follow top 3 links per page + min_gain_threshold=0.05 # Need 5% information gain to continue ) - # Access results and confidence - print(f"Confidence Level: {adaptive_crawler.confidence:.0%}") - print(f"Pages Crawled: {len(state.crawled_urls)}") - print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents") + async with AsyncWebCrawler(verbose=False) as crawler: + adaptive = AdaptiveCrawler(crawler, config) + + print("Starting adaptive crawl about Python decorators...") + result = await adaptive.digest( + start_url="https://docs.python.org/3/glossary.html", + query="python decorators functions wrapping" + ) + + print(f"\n✅ Crawling Complete!") + print(f"• Confidence Level: {adaptive.confidence:.0%}") + print(f"• Pages Crawled: {len(result.crawled_urls)}") + print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents") + + # Get most relevant content + relevant = adaptive.get_relevant_content(top_k=3) + print(f"\nMost Relevant Pages:") + for i, page in enumerate(relevant, 1): + print(f"{i}. {page['url']} (relevance: {page['score']:.2%})") + +asyncio.run(main()) ``` **Expected Real-World Impact:** @@ -141,56 +148,47 @@ async with AsyncWebCrawler() as crawler: **My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals. 
-### The Three-Layer Scoring System +### Intelligent Link Analysis and Scoring ```python -from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode +import asyncio +from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler +from crawl4ai.adaptive_crawler import LinkPreviewConfig -# Configure intelligent link analysis -link_config = LinkPreviewConfig( - include_internal=True, - include_external=False, - max_links=10, - concurrency=5, - query="python tutorial", # For contextual scoring - score_threshold=0.3, - verbose=True -) - -# Use in your crawl -result = await crawler.arun( - "https://tech-blog.example.com", - config=CrawlerRunConfig( - link_preview_config=link_config, - score_links=True, # Enable intrinsic scoring - cache_mode=CacheMode.BYPASS +async def main(): + # Configure intelligent link analysis + link_config = LinkPreviewConfig( + include_internal=True, + include_external=False, + max_links=10, + concurrency=5, + query="python tutorial", # For contextual scoring + score_threshold=0.3, + verbose=True ) -) + # Use in your crawl + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://www.geeksforgeeks.org/", + config=CrawlerRunConfig( + link_preview_config=link_config, + score_links=True, # Enable intrinsic scoring + cache_mode=CacheMode.BYPASS + ) + ) -# Access scored and sorted links -if result.success and result.links: -# Get scored links -internal_links = result.links.get("internal", []) -scored_links = [l for l in internal_links if l.get("total_score")] -scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True) + # Access scored and sorted links + if result.success and result.links: + for link in result.links.get("internal", []): + text = link.get('text', 'No text')[:40] + print( + text, + f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10", + f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1", + 
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000" + ) -# Create a scoring table -table = Table(title="Link Scoring Results", box=box.ROUNDED) -table.add_column("Link Text", style="cyan", width=40) -table.add_column("Intrinsic Score", justify="center") -table.add_column("Contextual Score", justify="center") -table.add_column("Total Score", justify="center", style="bold green") - -for link in scored_links[:5]: - text = link.get('text', 'No text')[:40] - table.add_row( - text, - f"{link.get('intrinsic_score', 0):.1f}/10", - f"{link.get('contextual_score', 0):.2f}/1", - f"{link.get('total_score', 0):.3f}" - ) - -console.print(table) +asyncio.run(main()) ``` **Scoring Components:** @@ -223,58 +221,34 @@ console.print(table) ### Technical Architecture ```python +import asyncio from crawl4ai import AsyncUrlSeeder, SeedingConfig -# Basic discovery - find all product pages -seeder_config = SeedingConfig( - # Discovery sources - source="cc+sitemap", # Sitemap + Common Crawl - - # Filtering - pattern="*/product/*", # URL pattern matching - - # Validation - live_check=True, # Verify URLs are alive - max_urls=50, # Stop at 50 URLs - - # Performance - concurrency=100, # Maximum concurrent requests for live checks/head extraction - hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers -) +async def main(): + async with AsyncUrlSeeder() as seeder: + # Discover Python tutorial URLs + config = SeedingConfig( + source="sitemap", # Use sitemap + pattern="*python*", # URL pattern filter + extract_head=True, # Get metadata + query="python tutorial", # For relevance scoring + scoring_method="bm25", + score_threshold=0.2, + max_urls=10 + ) + + print("Discovering Python async tutorial URLs...") + urls = await seeder.urls("https://www.geeksforgeeks.org/", config) + + print(f"\n✅ Found {len(urls)} relevant URLs:") + for i, url_info in enumerate(urls[:5], 1): + print(f"\n{i}. 
{url_info['url']}") + if url_info.get('relevance_score'): + print(f" Relevance: {url_info['relevance_score']:.3f}") + if url_info.get('head_data', {}).get('title'): + print(f" Title: {url_info['head_data']['title'][:60]}...") -async with AsyncUrlSeeder() as seeder: - console.print("Discovering URLs from Python docs...") - urls = await seeder.urls("docs.python.org", seeding_config) - console.print(f"\n✓ Discovered {len(urls)} URLs") - -# Advanced: Relevance-based discovery -research_config = SeedingConfig( - source="sitemap+cc", # Sitemap + Common Crawl - pattern="*/blog/*", # Blog posts only - - # Content relevance - extract_head=True, # Get meta tags - query="quantum computing tutorials", - scoring_method="bm25", # BM25 scoring method - score_threshold=0.4, # High relevance only - - # Smart filtering - filter_nonsense_urls=True, # Remove .xml, .txt, etc. - - force=True # Bypass cache -) - -# Discover with progress tracking -discovered = [] -async with AsyncUrlSeeder() as seeder: - discovered = await seeder.urls("https://physics-blog.com", research_config) - console.print(f"\n✓ Discovered {len(discovered)} URLs") - -# Results include scores and metadata -for url_data in discovered[:5]: - print(f"URL: {url_data['url']}") - print(f"Score: {url_data['relevance_score']:.3f}") - print(f"Title: {url_data['head_data']['title']}") +asyncio.run(main()) ``` **Discovery Methods:** diff --git a/docs/md_v2/core/link-media.md b/docs/md_v2/core/link-media.md index f6305ccc..ebce30bd 100644 --- a/docs/md_v2/core/link-media.md +++ b/docs/md_v2/core/link-media.md @@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately: ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig async def extract_link_heads_example(): """ @@ -237,7 +237,7 @@ if __name__ == "__main__": The `LinkPreviewConfig` class supports these options: ```python -from 
crawl4ai.async_configs import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig link_preview_config = LinkPreviewConfig( # BASIC SETTINGS diff --git a/docs/releases_review/crawl4ai_v0_7_0_showcase.py b/docs/releases_review/crawl4ai_v0_7_0_showcase.py index eca03d04..29c056f0 100644 --- a/docs/releases_review/crawl4ai_v0_7_0_showcase.py +++ b/docs/releases_review/crawl4ai_v0_7_0_showcase.py @@ -28,7 +28,7 @@ from rich import box from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode from crawl4ai import AsyncUrlSeeder, SeedingConfig -from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig +from crawl4ai import LinkPreviewConfig, VirtualScrollConfig from crawl4ai import c4a_compile, CompilationResult # Initialize Rich console for beautiful output diff --git a/docs/releases_review/v0_7_0_features_demo.py b/docs/releases_review/v0_7_0_features_demo.py index 5a68ff48..2f803a3b 100644 --- a/docs/releases_review/v0_7_0_features_demo.py +++ b/docs/releases_review/v0_7_0_features_demo.py @@ -13,14 +13,13 @@ from crawl4ai import ( BrowserConfig, CacheMode, # New imports for v0.7.0 - LinkPreviewConfig, VirtualScrollConfig, + LinkPreviewConfig, AdaptiveCrawler, AdaptiveConfig, AsyncUrlSeeder, SeedingConfig, c4a_compile, - CompilationResult ) @@ -170,16 +169,16 @@ async def demo_url_seeder(): # Discover Python tutorial URLs config = SeedingConfig( source="sitemap", # Use sitemap - pattern="*tutorial*", # URL pattern filter + pattern="*python*", # URL pattern filter extract_head=True, # Get metadata - query="python async programming", # For relevance scoring + query="python tutorial", # For relevance scoring scoring_method="bm25", score_threshold=0.2, max_urls=10 ) print("Discovering Python async tutorial URLs...") - urls = await seeder.urls("docs.python.org", config) + urls = await seeder.urls("https://www.geeksforgeeks.org/", config) print(f"\n✅ Found {len(urls)} relevant URLs:") for i, 
url_info in enumerate(urls[:5], 1): @@ -245,39 +244,6 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]` print(f"❌ Compilation error: {result.first_error.message}") -async def demo_pdf_support(): - """ - Demo 6: PDF Parsing Support - - Shows how to extract content from PDF files. - Note: Requires 'pip install crawl4ai[pdf]' - """ - print("\n" + "="*60) - print("📄 DEMO 6: PDF Parsing Support") - print("="*60) - - try: - # Check if PDF support is installed - import PyPDF2 - - # Example: Process a PDF URL - config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - pdf=True, # Enable PDF generation - extract_text_from_pdf=True # Extract text content - ) - - print("PDF parsing is available!") - print("You can now crawl PDF URLs and extract their content.") - print("\nExample usage:") - print(' result = await crawler.arun("https://example.com/document.pdf")') - print(' pdf_text = result.extracted_content # Contains extracted text') - - except ImportError: - print("⚠️ PDF support not installed.") - print("Install with: pip install crawl4ai[pdf]") - - async def main(): """Run all demos""" print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations") @@ -289,7 +255,6 @@ async def main(): ("Virtual Scroll", demo_virtual_scroll), ("URL Seeder", demo_url_seeder), ("C4A Script", demo_c4a_script), - ("PDF Support", demo_pdf_support) ] for name, demo_func in demos: @@ -309,7 +274,6 @@ async def main(): print("• Virtual Scroll: Capture all content from modern web pages") print("• URL Seeder: Pre-discover and filter URLs efficiently") print("• C4A Script: Simple language for complex automations") - print("• PDF Support: Extract content from PDF documents") if __name__ == "__main__": diff --git a/tests/test_link_extractor.py b/tests/test_link_extractor.py index 3f64d7a3..1482ce01 100644 --- a/tests/test_link_extractor.py +++ b/tests/test_link_extractor.py @@ -5,7 +5,7 @@ Test script for Link Extractor functionality from crawl4ai.models import Link from crawl4ai import 
AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.async_configs import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig import asyncio import sys import os @@ -237,7 +237,7 @@ def test_config_examples(): print(f" {key}: {value}") print(" Usage:") - print(" from crawl4ai.async_configs import LinkPreviewConfig") + print(" from crawl4ai import LinkPreviewConfig") print(" config = CrawlerRunConfig(") print(" link_preview_config=LinkPreviewConfig(") for key, value in config_dict.items(): From 26bad799e41a7dd6ad26fca4ed9b4e94858cccf8 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Thu, 17 Jul 2025 11:37:41 +0200 Subject: [PATCH 9/9] chore: update version to 0.7.1 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 3a436986..4a3ac419 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,7 +1,7 @@ # crawl4ai/__version__.py # This is the version that will be used for stable releases -__version__ = "0.7.0" +__version__ = "0.7.1" # For nightly builds, this gets set during build process __nightly_version__ = None