From 58024755c589c899e3eac5df702e4482b705ce27 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Jul 2025 10:15:05 +0200 Subject: [PATCH 1/4] docs: Update adaptive crawling parameters and examples in README and release notes --- README.md | 17 +++++----- docs/blog/release-v0.7.0.md | 52 +++++++++++++------------------ docs/md_v2/blog/releases/0.7.0.md | 52 +++++++++++++------------------ 3 files changed, 52 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index b74f386e..97a907a1 100644 --- a/README.md +++ b/README.md @@ -523,15 +523,18 @@ async def test_news_crawl(): - **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically: ```python config = AdaptiveConfig( - confidence_threshold=0.7, - max_history=100, - learning_rate=0.2 + confidence_threshold=0.7, # Min confidence to stop crawling + max_depth=5, # Maximum crawl depth + max_pages=20, # Maximum number of pages to crawl + strategy="statistical" ) - result = await crawler.arun( - "https://news.example.com", - config=CrawlerRunConfig(adaptive_config=config) - ) + async with AsyncWebCrawler() as crawler: + adaptive_crawler = AdaptiveCrawler(crawler, config) + state = await adaptive_crawler.digest( + start_url="https://news.example.com", + query="latest news content" + ) # Crawler learns patterns and improves extraction over time ``` diff --git a/docs/blog/release-v0.7.0.md b/docs/blog/release-v0.7.0.md index 4ae9a689..56fb4914 100644 --- a/docs/blog/release-v0.7.0.md +++ b/docs/blog/release-v0.7.0.md @@ -30,44 +30,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking: - Extraction confidence scores ```python -from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState +from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig -# Initialize with custom learning parameters +# Initialize with custom adaptive parameters config = AdaptiveConfig( - confidence_threshold=0.7, # Min confidence to use learned patterns - 
max_history=100, # Remember last 100 crawls per domain - learning_rate=0.2, # How quickly to adapt to changes - patterns_per_page=3, # Patterns to learn per page type - extraction_strategy='css' # 'css' or 'xpath' + confidence_threshold=0.7, # Min confidence to stop crawling + max_depth=5, # Maximum crawl depth + max_pages=20, # Maximum number of pages to crawl + top_k_links=3, # Number of top links to follow per page + strategy="statistical", # 'statistical' or 'embedding' + coverage_weight=0.4, # Weight for coverage in confidence calculation + consistency_weight=0.3, # Weight for consistency in confidence calculation + saturation_weight=0.3 # Weight for saturation in confidence calculation ) -adaptive_crawler = AdaptiveCrawler(config) - -# First crawl - crawler learns the structure +# Initialize adaptive crawler with web crawler async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - "https://news.example.com/article/12345", - config=CrawlerRunConfig( - adaptive_config=config, - extraction_hints={ # Optional hints to speed up learning - "title": "article h1", - "content": "article .body-content" - } - ) + adaptive_crawler = AdaptiveCrawler(crawler, config) + + # Crawl and learn patterns + state = await adaptive_crawler.digest( + start_url="https://news.example.com/article/12345", + query="latest news articles and content" ) - # Crawler identifies and stores patterns - if result.success: - state = adaptive_crawler.get_state("news.example.com") - print(f"Learned {len(state.patterns)} patterns") - print(f"Confidence: {state.avg_confidence:.2%}") - -# Subsequent crawls - uses learned patterns -result2 = await crawler.arun( - "https://news.example.com/article/67890", - config=CrawlerRunConfig(adaptive_config=config) -) -# Automatically extracts using learned patterns! 
+ # Access results and confidence + print(f"Confidence Level: {adaptive_crawler.confidence:.0%}") + print(f"Pages Crawled: {len(state.crawled_urls)}") + print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents") ``` **Expected Real-World Impact:** diff --git a/docs/md_v2/blog/releases/0.7.0.md b/docs/md_v2/blog/releases/0.7.0.md index 4ae9a689..56fb4914 100644 --- a/docs/md_v2/blog/releases/0.7.0.md +++ b/docs/md_v2/blog/releases/0.7.0.md @@ -30,44 +30,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking: - Extraction confidence scores ```python -from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState +from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig -# Initialize with custom learning parameters +# Initialize with custom adaptive parameters config = AdaptiveConfig( - confidence_threshold=0.7, # Min confidence to use learned patterns - max_history=100, # Remember last 100 crawls per domain - learning_rate=0.2, # How quickly to adapt to changes - patterns_per_page=3, # Patterns to learn per page type - extraction_strategy='css' # 'css' or 'xpath' + confidence_threshold=0.7, # Min confidence to stop crawling + max_depth=5, # Maximum crawl depth + max_pages=20, # Maximum number of pages to crawl + top_k_links=3, # Number of top links to follow per page + strategy="statistical", # 'statistical' or 'embedding' + coverage_weight=0.4, # Weight for coverage in confidence calculation + consistency_weight=0.3, # Weight for consistency in confidence calculation + saturation_weight=0.3 # Weight for saturation in confidence calculation ) -adaptive_crawler = AdaptiveCrawler(config) - -# First crawl - crawler learns the structure +# Initialize adaptive crawler with web crawler async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - "https://news.example.com/article/12345", - config=CrawlerRunConfig( - adaptive_config=config, - extraction_hints={ # Optional hints to speed up learning 
- "title": "article h1", - "content": "article .body-content" - } - ) + adaptive_crawler = AdaptiveCrawler(crawler, config) + + # Crawl and learn patterns + state = await adaptive_crawler.digest( + start_url="https://news.example.com/article/12345", + query="latest news articles and content" ) - # Crawler identifies and stores patterns - if result.success: - state = adaptive_crawler.get_state("news.example.com") - print(f"Learned {len(state.patterns)} patterns") - print(f"Confidence: {state.avg_confidence:.2%}") - -# Subsequent crawls - uses learned patterns -result2 = await crawler.arun( - "https://news.example.com/article/67890", - config=CrawlerRunConfig(adaptive_config=config) -) -# Automatically extracts using learned patterns! + # Access results and confidence + print(f"Confidence Level: {adaptive_crawler.confidence:.0%}") + print(f"Pages Crawled: {len(state.crawled_urls)}") + print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents") ``` **Expected Real-World Impact:** From 2640dc73a59fe82f45a73662de21bd7d569ed14a Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Jul 2025 10:19:29 +0200 Subject: [PATCH 2/4] docs: Enhance session management example for dynamic content crawling with improved JavaScript handling and extraction schema. 
ref #226 --- docs/md_v2/advanced/session-management.md | 79 ++++++++++++++++------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index d63b1e80..1007a605 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy from crawl4ai.cache_context import CacheMode async def crawl_dynamic_content(): - async with AsyncWebCrawler() as crawler: - session_id = "github_commits_session" - url = "https://github.com/microsoft/TypeScript/commits/main" - all_commits = [] + url = "https://github.com/microsoft/TypeScript/commits/main" + session_id = "wait_for_session" + all_commits = [] - # Define extraction schema - schema = { - "name": "Commit Extractor", - "baseSelector": "li.Box-sc-g0xbh4-0", - "fields": [{ - "name": "title", "selector": "h4.markdown-title", "type": "text" - }], - } - extraction_strategy = JsonCssExtractionStrategy(schema) + js_next_page = """ + const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4'); + if (commits.length > 0) { + window.lastCommit = commits[0].textContent.trim(); + } + const button = document.querySelector('a[data-testid="pagination-next-button"]'); + if (button) {button.click(); console.log('button clicked') } + """ - # JavaScript and wait configurations - js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();""" - wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0""" - - # Crawl multiple pages + wait_for = """() => { + const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4'); + if (commits.length === 0) return false; + const firstCommit = commits[0].textContent.trim(); + return firstCommit !== window.lastCommit; + }""" + + schema = { + "name": "Commit Extractor", + "baseSelector": "li[data-testid='commit-row-item']", + 
"fields": [ + { + "name": "title", + "selector": "h4 a", + "type": "text", + "transform": "strip", + }, + ], + } + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + + browser_config = BrowserConfig( + verbose=True, + headless=False, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: for page in range(3): - config = CrawlerRunConfig( - url=url, + crawler_config = CrawlerRunConfig( session_id=session_id, + css_selector="li[data-testid='commit-row-item']", extraction_strategy=extraction_strategy, js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - cache_mode=CacheMode.BYPASS + cache_mode=CacheMode.BYPASS, + capture_console_messages=True, ) - - result = await crawler.arun(config=config) - if result.success: + + result = await crawler.arun(url=url, config=crawler_config) + + if result.console_messages: + print(f"Page {page + 1} console messages:", result.console_messages) + + if result.extracted_content: + # print(f"Page {page + 1} result:", result.extracted_content) commits = json.loads(result.extracted_content) all_commits.extend(commits) print(f"Page {page + 1}: Found {len(commits)} commits") + else: + print(f"Page {page + 1}: No content extracted") + print(f"Successfully crawled {len(all_commits)} commits across 3 pages") # Clean up session await crawler.crawler_strategy.kill_session(session_id) - return all_commits ``` --- From 205df1e33043d87d8d3f731823a76a9173be051b Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Jul 2025 10:29:47 +0200 Subject: [PATCH 3/4] docs: Fix virtual scroll configuration --- docs/blog/release-v0.7.0.md | 11 +++-------- docs/md_v2/blog/releases/0.7.0.md | 11 +++-------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/docs/blog/release-v0.7.0.md b/docs/blog/release-v0.7.0.md index 56fb4914..6bbf670a 100644 --- a/docs/blog/release-v0.7.0.md +++ b/docs/blog/release-v0.7.0.md @@ -82,9 +82,7 @@ twitter_config = 
VirtualScrollConfig( container_selector="[data-testid='primaryColumn']", scroll_count=20, # Number of scrolls scroll_by="container_height", # Smart scrolling by container size - wait_after_scroll=1.0, # Let content load - capture_method="incremental", # Capture new content on each scroll - deduplicate=True # Remove duplicate elements + wait_after_scroll=1.0 # Let content load ) # For e-commerce product grids (Instagram style) @@ -92,8 +90,7 @@ grid_config = VirtualScrollConfig( container_selector="main .product-grid", scroll_count=30, scroll_by=800, # Fixed pixel scrolling - wait_after_scroll=1.5, # Images need time - stop_on_no_change=True # Smart stopping + wait_after_scroll=1.5 # Images need time ) # For news feeds with lazy loading @@ -101,9 +98,7 @@ news_config = VirtualScrollConfig( container_selector=".article-feed", scroll_count=50, scroll_by="page_height", # Viewport-based scrolling - wait_after_scroll=0.5, - wait_for_selector=".article-card", # Wait for specific elements - timeout=30000 # Max 30 seconds total + wait_after_scroll=0.5 # Wait for content to load ) # Use it in your crawl diff --git a/docs/md_v2/blog/releases/0.7.0.md b/docs/md_v2/blog/releases/0.7.0.md index 56fb4914..6bbf670a 100644 --- a/docs/md_v2/blog/releases/0.7.0.md +++ b/docs/md_v2/blog/releases/0.7.0.md @@ -82,9 +82,7 @@ twitter_config = VirtualScrollConfig( container_selector="[data-testid='primaryColumn']", scroll_count=20, # Number of scrolls scroll_by="container_height", # Smart scrolling by container size - wait_after_scroll=1.0, # Let content load - capture_method="incremental", # Capture new content on each scroll - deduplicate=True # Remove duplicate elements + wait_after_scroll=1.0 # Let content load ) # For e-commerce product grids (Instagram style) @@ -92,8 +90,7 @@ grid_config = VirtualScrollConfig( container_selector="main .product-grid", scroll_count=30, scroll_by=800, # Fixed pixel scrolling - wait_after_scroll=1.5, # Images need time - stop_on_no_change=True # Smart 
stopping + wait_after_scroll=1.5 # Images need time ) # For news feeds with lazy loading @@ -101,9 +98,7 @@ news_config = VirtualScrollConfig( container_selector=".article-feed", scroll_count=50, scroll_by="page_height", # Viewport-based scrolling - wait_after_scroll=0.5, - wait_for_selector=".article-card", # Wait for specific elements - timeout=30000 # Max 30 seconds total + wait_after_scroll=0.5 # Wait for content to load ) # Use it in your crawl From 1d1970ae69f6f886578e7e2ca64bf1cb31a11326 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Tue, 15 Jul 2025 11:32:04 +0200 Subject: [PATCH 4/4] docs: Update release notes and docs for v0.7.0 with the correct parameters and explanations --- docs/blog/release-v0.7.0.md | 152 ++++++++++---------- docs/md_v2/advanced/virtual-scroll.md | 11 +- docs/md_v2/blog/releases/0.7.0.md | 152 ++++++++++---------- docs/md_v2/core/adaptive-crawling.md | 20 ++-- docs/md_v2/core/url-seeding.md | 21 ++-- 5 files changed, 146 insertions(+), 210 deletions(-) diff --git a/docs/blog/release-v0.7.0.md b/docs/blog/release-v0.7.0.md index 6bbf670a..0772ae58 100644 --- a/docs/blog/release-v0.7.0.md +++ b/docs/blog/release-v0.7.0.md @@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. 
This rel - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages -- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization +- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering -- **PDF Parsing**: Extract data from PDF documents - **Performance Optimizations**: Significant speed and memory improvements ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning @@ -145,29 +144,17 @@ async with AsyncWebCrawler() as crawler: ### The Three-Layer Scoring System ```python -from crawl4ai import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode # Configure intelligent link analysis link_config = LinkPreviewConfig( - # What to analyze include_internal=True, - include_external=True, - max_links=100, # Analyze top 100 links - - # Relevance scoring - query="machine learning tutorials", # Your interest - score_threshold=0.3, # Minimum relevance score - - # Performance - concurrent_requests=10, # Parallel processing - timeout_per_link=5000, # 5s per link - - # Advanced scoring weights - scoring_weights={ - "intrinsic": 0.3, # Link quality indicators - "contextual": 0.5, # Relevance to query - "popularity": 0.2 # Link prominence - } + include_external=False, + max_links=10, + concurrency=5, + query="python tutorial", # For contextual scoring + score_threshold=0.3, + verbose=True ) # Use in your crawl @@ -175,35 +162,51 @@ result = await crawler.arun( "https://tech-blog.example.com", config=CrawlerRunConfig( link_preview_config=link_config, - score_links=True + score_links=True, # Enable intrinsic scoring + cache_mode=CacheMode.BYPASS ) ) # Access scored and sorted links -for link in result.links["internal"][:10]: # Top 10 internal links - print(f"Score: {link['total_score']:.3f}") 
- print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes - print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query - print(f" URL: {link['href']}") - print(f" Title: {link['head_data']['title']}") - print(f" Description: {link['head_data']['meta']['description'][:100]}...") +if result.success and result.links: +# Get scored links +internal_links = result.links.get("internal", []) +scored_links = [l for l in internal_links if l.get("total_score")] +scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True) + +# Create a scoring table +table = Table(title="Link Scoring Results", box=box.ROUNDED) +table.add_column("Link Text", style="cyan", width=40) +table.add_column("Intrinsic Score", justify="center") +table.add_column("Contextual Score", justify="center") +table.add_column("Total Score", justify="center", style="bold green") + +for link in scored_links[:5]: + text = link.get('text', 'No text')[:40] + table.add_row( + text, + f"{link.get('intrinsic_score', 0):.1f}/10", + f"{link.get('contextual_score', 0):.2f}/1", + f"{link.get('total_score', 0):.3f}" + ) + +console.print(table) ``` **Scoring Components:** -1. **Intrinsic Score (0-10)**: Based on link quality indicators +1. **Intrinsic Score**: Based on link quality indicators - Position on page (navigation, content, footer) - Link attributes (rel, title, class names) - Anchor text quality and length - URL structure and depth -2. **Contextual Score (0-1)**: Relevance to your query - - Semantic similarity using embeddings +2. **Contextual Score**: Relevance to your query using BM25 algorithm - Keyword matching in link text and title - Meta description analysis - Content preview scoring -3. **Total Score**: Weighted combination for final ranking +3. 
**Total Score**: Combined score for final ranking **Expected Real-World Impact:** - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links @@ -225,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig # Basic discovery - find all product pages seeder_config = SeedingConfig( # Discovery sources - source="sitemap+cc", # Sitemap + Common Crawl + source="cc+sitemap", # Sitemap + Common Crawl # Filtering pattern="*/product/*", # URL pattern matching - ignore_patterns=["*/reviews/*", "*/questions/*"], # Validation live_check=True, # Verify URLs are alive - max_urls=5000, # Stop at 5000 URLs + max_urls=50, # Stop at 50 URLs # Performance - concurrency=100, # Parallel requests - hits_per_sec=10 # Rate limiting + concurrency=100, # Maximum concurrent requests for live checks/head extraction + hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers ) -seeder = AsyncUrlSeeder(seeder_config) -urls = await seeder.discover("https://shop.example.com") +async with AsyncUrlSeeder() as seeder: + console.print("Discovering URLs from Python docs...") + urls = await seeder.urls("docs.python.org", seeding_config) + console.print(f"\nβœ“ Discovered {len(urls)} URLs") # Advanced: Relevance-based discovery research_config = SeedingConfig( - source="crawl+sitemap", # Deep crawl + sitemap + source="sitemap+cc", # Sitemap + Common Crawl pattern="*/blog/*", # Blog posts only # Content relevance extract_head=True, # Get meta tags query="quantum computing tutorials", - scoring_method="bm25", # Or "semantic" (coming soon) + scoring_method="bm25", # BM25 scoring method score_threshold=0.4, # High relevance only # Smart filtering filter_nonsense_urls=True, # Remove .xml, .txt, etc. 
- min_content_length=500, # Skip thin content force=True # Bypass cache ) # Discover with progress tracking discovered = [] -async for batch in seeder.discover_iter("https://physics-blog.com", research_config): - discovered.extend(batch) - print(f"Found {len(discovered)} relevant URLs so far...") +async with AsyncUrlSeeder() as seeder: + discovered = await seeder.urls("https://physics-blog.com", research_config) + console.print(f"\nβœ“ Discovered {len(discovered)} URLs") # Results include scores and metadata for url_data in discovered[:5]: print(f"URL: {url_data['url']}") - print(f"Score: {url_data['score']:.3f}") - print(f"Title: {url_data['title']}") + print(f"Score: {url_data['relevance_score']:.3f}") + print(f"Title: {url_data['head_data']['title']}") ``` **Discovery Methods:** @@ -294,35 +297,18 @@ This release includes significant performance improvements through optimized res ### What We Optimized ```python -# Before v0.7.0 (slow) +# Optimized crawling with v0.7.0 improvements results = [] for url in urls: - result = await crawler.arun(url) - results.append(result) - -# After v0.7.0 (fast) -# Automatic batching and connection pooling -results = await crawler.arun_batch( - urls, - config=CrawlerRunConfig( - # New performance options - batch_size=10, # Process 10 URLs concurrently - reuse_browser=True, # Keep browser warm - eager_loading=False, # Load only what's needed - streaming_extraction=True, # Stream large extractions - - # Optimized defaults - wait_until="domcontentloaded", # Faster than networkidle - exclude_external_resources=True, # Skip third-party assets - block_ads=True # Ad blocking built-in + result = await crawler.arun( + url, + config=CrawlerRunConfig( + # Performance optimizations + wait_until="domcontentloaded", # Faster than networkidle + cache_mode=CacheMode.ENABLED # Enable caching + ) ) -) - -# Memory-efficient streaming for large crawls -async for result in crawler.arun_stream(large_url_list): - # Process results as they complete - 
await process_result(result) - # Memory is freed after each iteration + results.append(result) ``` **Performance Gains:** @@ -332,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list): - **Memory Usage**: 60% reduction with streaming processing - **Concurrent Crawls**: Handle 5x more parallel requests -## πŸ“„ PDF Support - -PDF extraction is now natively supported in Crawl4AI. - -```python -# Extract data from PDF documents -result = await crawler.arun( - "https://example.com/report.pdf", - config=CrawlerRunConfig( - pdf_extraction=True, - extraction_strategy=JsonCssExtractionStrategy({ - # Works on converted PDF structure - "title": {"selector": "h1", "type": "text"}, - "sections": {"selector": "h2", "type": "list"} - }) - ) -) -``` ## πŸ”§ Important Changes diff --git a/docs/md_v2/advanced/virtual-scroll.md b/docs/md_v2/advanced/virtual-scroll.md index 0b1a8f88..271a9564 100644 --- a/docs/md_v2/advanced/virtual-scroll.md +++ b/docs/md_v2/advanced/virtual-scroll.md @@ -91,13 +91,12 @@ async def crawl_twitter_timeline(): wait_after_scroll=1.0 # Twitter needs time to load ) + browser_config = BrowserConfig(headless=True) # Set to False to watch it work config = CrawlerRunConfig( - virtual_scroll_config=virtual_config, - # Optional: Set headless=False to watch it work - # browser_config=BrowserConfig(headless=False) + virtual_scroll_config=virtual_config ) - async with AsyncWebCrawler() as crawler: + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url="https://twitter.com/search?q=AI", config=config @@ -200,7 +199,7 @@ Use **scan_full_page** when: Virtual Scroll works seamlessly with extraction strategies: ```python -from crawl4ai import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy, LLMConfig # Define extraction schema schema = { @@ -222,7 +221,7 @@ config = CrawlerRunConfig( scroll_count=20 ), extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o-mini", + 
llm_config=LLMConfig(provider="openai/gpt-4o-mini"), schema=schema ) ) diff --git a/docs/md_v2/blog/releases/0.7.0.md b/docs/md_v2/blog/releases/0.7.0.md index 6bbf670a..0772ae58 100644 --- a/docs/md_v2/blog/releases/0.7.0.md +++ b/docs/md_v2/blog/releases/0.7.0.md @@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0β€”the Adaptive Intelligence Update. This rel - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages -- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization +- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering -- **PDF Parsing**: Extract data from PDF documents - **Performance Optimizations**: Significant speed and memory improvements ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning @@ -145,29 +144,17 @@ async with AsyncWebCrawler() as crawler: ### The Three-Layer Scoring System ```python -from crawl4ai import LinkPreviewConfig +from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode # Configure intelligent link analysis link_config = LinkPreviewConfig( - # What to analyze include_internal=True, - include_external=True, - max_links=100, # Analyze top 100 links - - # Relevance scoring - query="machine learning tutorials", # Your interest - score_threshold=0.3, # Minimum relevance score - - # Performance - concurrent_requests=10, # Parallel processing - timeout_per_link=5000, # 5s per link - - # Advanced scoring weights - scoring_weights={ - "intrinsic": 0.3, # Link quality indicators - "contextual": 0.5, # Relevance to query - "popularity": 0.2 # Link prominence - } + include_external=False, + max_links=10, + concurrency=5, + query="python tutorial", # For contextual scoring + score_threshold=0.3, + verbose=True ) # Use in your crawl @@ -175,35 +162,51 @@ result = 
await crawler.arun( "https://tech-blog.example.com", config=CrawlerRunConfig( link_preview_config=link_config, - score_links=True + score_links=True, # Enable intrinsic scoring + cache_mode=CacheMode.BYPASS ) ) # Access scored and sorted links -for link in result.links["internal"][:10]: # Top 10 internal links - print(f"Score: {link['total_score']:.3f}") - print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes - print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query - print(f" URL: {link['href']}") - print(f" Title: {link['head_data']['title']}") - print(f" Description: {link['head_data']['meta']['description'][:100]}...") +if result.success and result.links: +# Get scored links +internal_links = result.links.get("internal", []) +scored_links = [l for l in internal_links if l.get("total_score")] +scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True) + +# Create a scoring table +table = Table(title="Link Scoring Results", box=box.ROUNDED) +table.add_column("Link Text", style="cyan", width=40) +table.add_column("Intrinsic Score", justify="center") +table.add_column("Contextual Score", justify="center") +table.add_column("Total Score", justify="center", style="bold green") + +for link in scored_links[:5]: + text = link.get('text', 'No text')[:40] + table.add_row( + text, + f"{link.get('intrinsic_score', 0):.1f}/10", + f"{link.get('contextual_score', 0):.2f}/1", + f"{link.get('total_score', 0):.3f}" + ) + +console.print(table) ``` **Scoring Components:** -1. **Intrinsic Score (0-10)**: Based on link quality indicators +1. **Intrinsic Score**: Based on link quality indicators - Position on page (navigation, content, footer) - Link attributes (rel, title, class names) - Anchor text quality and length - URL structure and depth -2. **Contextual Score (0-1)**: Relevance to your query - - Semantic similarity using embeddings +2. 
**Contextual Score**: Relevance to your query using BM25 algorithm - Keyword matching in link text and title - Meta description analysis - Content preview scoring -3. **Total Score**: Weighted combination for final ranking +3. **Total Score**: Combined score for final ranking **Expected Real-World Impact:** - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links @@ -225,53 +228,53 @@ from crawl4ai import AsyncUrlSeeder, SeedingConfig # Basic discovery - find all product pages seeder_config = SeedingConfig( # Discovery sources - source="sitemap+cc", # Sitemap + Common Crawl + source="cc+sitemap", # Sitemap + Common Crawl # Filtering pattern="*/product/*", # URL pattern matching - ignore_patterns=["*/reviews/*", "*/questions/*"], # Validation live_check=True, # Verify URLs are alive - max_urls=5000, # Stop at 5000 URLs + max_urls=50, # Stop at 50 URLs # Performance - concurrency=100, # Parallel requests - hits_per_sec=10 # Rate limiting + concurrency=100, # Maximum concurrent requests for live checks/head extraction + hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers ) -seeder = AsyncUrlSeeder(seeder_config) -urls = await seeder.discover("https://shop.example.com") +async with AsyncUrlSeeder() as seeder: + console.print("Discovering URLs from Python docs...") + urls = await seeder.urls("docs.python.org", seeding_config) + console.print(f"\nβœ“ Discovered {len(urls)} URLs") # Advanced: Relevance-based discovery research_config = SeedingConfig( - source="crawl+sitemap", # Deep crawl + sitemap + source="sitemap+cc", # Sitemap + Common Crawl pattern="*/blog/*", # Blog posts only # Content relevance extract_head=True, # Get meta tags query="quantum computing tutorials", - scoring_method="bm25", # Or "semantic" (coming soon) + scoring_method="bm25", # BM25 scoring method score_threshold=0.4, # High relevance only # Smart filtering filter_nonsense_urls=True, # Remove .xml, .txt, etc. 
- min_content_length=500, # Skip thin content force=True # Bypass cache ) # Discover with progress tracking discovered = [] -async for batch in seeder.discover_iter("https://physics-blog.com", research_config): - discovered.extend(batch) - print(f"Found {len(discovered)} relevant URLs so far...") +async with AsyncUrlSeeder() as seeder: + discovered = await seeder.urls("https://physics-blog.com", research_config) + console.print(f"\nβœ“ Discovered {len(discovered)} URLs") # Results include scores and metadata for url_data in discovered[:5]: print(f"URL: {url_data['url']}") - print(f"Score: {url_data['score']:.3f}") - print(f"Title: {url_data['title']}") + print(f"Score: {url_data['relevance_score']:.3f}") + print(f"Title: {url_data['head_data']['title']}") ``` **Discovery Methods:** @@ -294,35 +297,18 @@ This release includes significant performance improvements through optimized res ### What We Optimized ```python -# Before v0.7.0 (slow) +# Optimized crawling with v0.7.0 improvements results = [] for url in urls: - result = await crawler.arun(url) - results.append(result) - -# After v0.7.0 (fast) -# Automatic batching and connection pooling -results = await crawler.arun_batch( - urls, - config=CrawlerRunConfig( - # New performance options - batch_size=10, # Process 10 URLs concurrently - reuse_browser=True, # Keep browser warm - eager_loading=False, # Load only what's needed - streaming_extraction=True, # Stream large extractions - - # Optimized defaults - wait_until="domcontentloaded", # Faster than networkidle - exclude_external_resources=True, # Skip third-party assets - block_ads=True # Ad blocking built-in + result = await crawler.arun( + url, + config=CrawlerRunConfig( + # Performance optimizations + wait_until="domcontentloaded", # Faster than networkidle + cache_mode=CacheMode.ENABLED # Enable caching + ) ) -) - -# Memory-efficient streaming for large crawls -async for result in crawler.arun_stream(large_url_list): - # Process results as they complete - 
await process_result(result) - # Memory is freed after each iteration + results.append(result) ``` **Performance Gains:** @@ -332,24 +318,6 @@ async for result in crawler.arun_stream(large_url_list): - **Memory Usage**: 60% reduction with streaming processing - **Concurrent Crawls**: Handle 5x more parallel requests -## πŸ“„ PDF Support - -PDF extraction is now natively supported in Crawl4AI. - -```python -# Extract data from PDF documents -result = await crawler.arun( - "https://example.com/report.pdf", - config=CrawlerRunConfig( - pdf_extraction=True, - extraction_strategy=JsonCssExtractionStrategy({ - # Works on converted PDF structure - "title": {"selector": "h1", "type": "text"}, - "sections": {"selector": "h2", "type": "list"} - }) - ) -) -``` ## πŸ”§ Important Changes diff --git a/docs/md_v2/core/adaptive-crawling.md b/docs/md_v2/core/adaptive-crawling.md index 72e11937..ea1674c2 100644 --- a/docs/md_v2/core/adaptive-crawling.md +++ b/docs/md_v2/core/adaptive-crawling.md @@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler async def main(): async with AsyncWebCrawler() as crawler: - # Create an adaptive crawler + # Create an adaptive crawler (config is optional) adaptive = AdaptiveCrawler(crawler) # Start crawling with a query @@ -59,13 +59,13 @@ async def main(): from crawl4ai import AdaptiveConfig config = AdaptiveConfig( - confidence_threshold=0.7, # Stop when 70% confident (default: 0.8) - max_pages=20, # Maximum pages to crawl (default: 50) - top_k_links=3, # Links to follow per page (default: 5) + confidence_threshold=0.8, # Stop when 80% confident (default: 0.7) + max_pages=30, # Maximum pages to crawl (default: 20) + top_k_links=5, # Links to follow per page (default: 3) min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1) ) -adaptive = AdaptiveCrawler(crawler, config=config) +adaptive = AdaptiveCrawler(crawler, config) ``` ## Crawling Strategies @@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', 
False): The confidence score (0-1) indicates how sufficient the gathered information is: - **0.0-0.3**: Insufficient information, needs more crawling - **0.3-0.6**: Partial information, may answer basic queries -- **0.6-0.8**: Good coverage, can answer most queries -- **0.8-1.0**: Excellent coverage, comprehensive information +- **0.6-0.7**: Good coverage, can answer most queries +- **0.7-1.0**: Excellent coverage, comprehensive information ### Statistics Display @@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl") - Avoid overly broad queries ### 2. Threshold Tuning -- Start with default (0.8) for general use -- Lower to 0.6-0.7 for exploratory crawling -- Raise to 0.9+ for exhaustive coverage +- Start with default (0.7) for general use +- Lower to 0.5-0.6 for exploratory crawling +- Raise to 0.8+ for exhaustive coverage ### 3. Performance Optimization - Use appropriate `max_pages` limits diff --git a/docs/md_v2/core/url-seeding.md b/docs/md_v2/core/url-seeding.md index 24cdfa46..f891c204 100644 --- a/docs/md_v2/core/url-seeding.md +++ b/docs/md_v2/core/url-seeding.md @@ -137,7 +137,7 @@ async def smart_blog_crawler(): word_count_threshold=300 # Only substantial articles ) - # Extract URLs and stream results as they come + # Extract URLs and crawl them tutorial_urls = [t["url"] for t in tutorials[:10]] results = await crawler.arun_many(tutorial_urls, config=config) @@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I ```python # Use both sources -config = SeedingConfig(source="cc+sitemap") +config = SeedingConfig(source="sitemap+cc") urls = await seeder.urls("example.com", config) ``` @@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. 
Here's everything you can conf | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" | +| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" | | `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") | | `extract_head` | bool | False | Extract metadata from page `` | | `live_check` | bool | False | Verify URLs are accessible | | `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) | | `concurrency` | int | 10 | Parallel workers for fetching | -| `hits_per_sec` | int | None | Rate limit for requests | +| `hits_per_sec` | int | 5 | Rate limit for requests | | `force` | bool | False | Bypass cache, fetch fresh data | | `verbose` | bool | False | Show detailed progress | | `query` | str | None | Search query for BM25 scoring | @@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config) ```python # Find specific products config = SeedingConfig( - source="cc+sitemap", # Use both sources + source="sitemap+cc", # Use both sources extract_head=True, query="wireless headphones noise canceling", scoring_method="bm25", @@ -782,7 +782,7 @@ class ResearchAssistant: # Step 1: Discover relevant URLs config = SeedingConfig( - source="cc+sitemap", # Maximum coverage + source="sitemap+cc", # Maximum coverage extract_head=True, # Get metadata query=topic, # Research topic scoring_method="bm25", # Smart scoring @@ -832,7 +832,8 @@ class ResearchAssistant: # Extract URLs and crawl all articles article_urls = [article['url'] for article in top_articles] results = [] - async for result in await crawler.arun_many(article_urls, config=config): + crawl_results = await crawler.arun_many(article_urls, config=config) + async for result in crawl_results: if result.success: results.append({ 'url': result.url, @@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5) # When 
crawling many URLs async with AsyncWebCrawler() as crawler: # Assuming urls is a list of URL strings - results = await crawler.arun_many(urls, config=config) + crawl_results = await crawler.arun_many(urls, config=config) # Process as they arrive - async for result in results: + async for result in crawl_results: process_immediately(result) # Don't wait for all ``` @@ -1020,7 +1021,7 @@ config = SeedingConfig( # E-commerce product discovery config = SeedingConfig( - source="cc+sitemap", + source="sitemap+cc", pattern="*/product/*", extract_head=True, live_check=True