From 16b8d4945b831dc06130bd2b1c698f33c4c31d01 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 21 Jan 2025 21:03:11 +0800 Subject: [PATCH] feat(release): prepare v0.4.3 beta release Prepare the v0.4.3 beta release with major feature additions and improvements: - Add JsonXPathExtractionStrategy and LLMContentFilter to exports - Update version to 0.4.3b1 - Improve documentation for dispatchers and markdown generation - Update development status to Beta - Reorganize changelog format BREAKING CHANGE: Memory threshold in MemoryAdaptiveDispatcher increased to 90% and SemaphoreDispatcher parameter renamed to max_session_permit --- CHANGELOG.md | 150 ++++++++-- crawl4ai/__init__.py | 5 +- crawl4ai/__version__.py | 2 +- docs/examples/dispatcher_example.py | 3 +- docs/examples/llm_markdown_generator.py | 87 ++++++ .../scraping_strategies_performance.py | 135 +++++++++ docs/examples/v0_4_3_features_demo.py | 252 +++++++++++++++++ .../md_v2/advanced/multi-url-crawling copy.md | 264 ----------------- docs/md_v2/advanced/multi-url-crawling.md | 4 +- docs/md_v2/blog/releases/v0.4.3b1.md | 266 ++++++++++++++++++ docs/md_v2/core/markdown-generation.md | 2 +- pyproject.toml | 2 +- 12 files changed, 885 insertions(+), 287 deletions(-) create mode 100644 docs/examples/llm_markdown_generator.py create mode 100644 docs/examples/scraping_strategies_performance.py create mode 100644 docs/examples/v0_4_3_features_demo.py delete mode 100644 docs/md_v2/advanced/multi-url-crawling copy.md create mode 100644 docs/md_v2/blog/releases/v0.4.3b1.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c790f02..d62d8775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,19 +1,3 @@ -### [Added] 2025-01-21 -- Added robots.txt compliance support with efficient SQLite-based caching -- New `check_robots_txt` parameter in CrawlerRunConfig to enable robots.txt checking -- Documentation updates for robots.txt compliance features and examples -- Automated robots.txt checking integrated into AsyncWebCrawler 
with 403 status codes for blocked URLs - -### [Added] 2025-01-20 -- Added proxy configuration support to CrawlerRunConfig allowing dynamic proxy settings per crawl request -- Updated documentation with examples for using proxy configuration in crawl operations - -### [Added] 2025-01-20 -- New LLM-powered schema generation utility for JsonElementExtractionStrategy -- Support for automatic CSS and XPath schema generation using OpenAI or Ollama -- Comprehensive documentation and examples for schema generation -- New prompt templates optimized for HTML schema analysis - # Changelog All notable changes to Crawl4AI will be documented in this file. @@ -21,6 +5,140 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +Okay, here's a detailed changelog in Markdown format, generated from the provided git diff and commit history. I've focused on user-facing changes, fixes, and features, and grouped them as requested: + +## Version 0.4.3 (2025-01-21) + +This release introduces several powerful new features, including robots.txt compliance, dynamic proxy support, LLM-powered schema generation, and improved documentation. + +### Features + +- **Robots.txt Compliance:** + - Added robots.txt compliance support with efficient SQLite-based caching. + - New `check_robots_txt` parameter in `CrawlerRunConfig` to enable robots.txt checking before crawling a URL. + - Automated robots.txt checking is now integrated into `AsyncWebCrawler` with 403 status codes for blocked URLs. + +- **Proxy Configuration:** + - Added proxy configuration support to `CrawlerRunConfig`, allowing dynamic proxy settings per crawl request. + - Updated documentation with examples for using proxy configuration in crawl operations. 
+ +- **LLM-Powered Schema Generation:** + - Introduced a new utility for automatic CSS and XPath schema generation using OpenAI or Ollama models. + - Added comprehensive documentation and examples for schema generation. + - New prompt templates optimized for HTML schema analysis. + +- **URL Redirection Tracking:** + - Added URL redirection tracking to capture the final URL after any redirects. + - The final URL is now available in the `final_url` field of the `AsyncCrawlResponse` object. + +- **Enhanced Streamlined Documentation:** + - Refactored and improved the documentation structure for clarity and ease of use. + - Added detailed explanations of new features and updated examples. + +- **Improved Browser Context Management:** + - Enhanced the management of browser contexts and added shared data support. + - Introduced the `shared_data` parameter in `CrawlerRunConfig` to pass data between hooks. + +- **Memory Dispatcher System:** + - Migrated to a memory dispatcher system with enhanced monitoring capabilities. + - Introduced `MemoryAdaptiveDispatcher` and `SemaphoreDispatcher` for improved resource management. + - Added `RateLimiter` for rate limiting support. + - New `CrawlerMonitor` for real-time monitoring of crawler operations. + +- **Streaming Support:** + - Added streaming support for processing crawled URLs as they are processed. + - Enabled streaming mode with the `stream` parameter in `CrawlerRunConfig`. + +- **Content Scraping Strategy:** + - Introduced a new `LXMLWebScrapingStrategy` for faster content scraping. + - Added support for selecting the scraping strategy via the `scraping_strategy` parameter in `CrawlerRunConfig`. + +### Bug Fixes + +- **Browser Path Management:** + - Improved browser path management for consistent behavior across different environments. + +- **Memory Threshold:** + - Adjusted the default memory threshold to improve resource utilization. 
+ +- **Pydantic Model Fields:** + - Made several model fields optional with default values to improve flexibility. + +### Refactor + +- **Documentation Structure:** + - Reorganized documentation structure to improve navigation and readability. + - Updated styles and added new sections for advanced features. + +- **Scraping Mode:** + - Replaced the `ScrapingMode` enum with a strategy pattern for more flexible content scraping. + +- **Version Update:** + - Updated the version to `0.4.248`. + +- **Code Cleanup:** + - Removed unused files and improved type hints. + - Applied Ruff corrections for code quality. + +- **Updated dependencies:** + - Updated dependencies to their latest versions to ensure compatibility and security. + +- **Ignored certain patterns and directories:** + - Updated `.gitignore` and `.codeiumignore` to ignore additional patterns and directories, streamlining the development environment. + +- **Simplified Personal Story in README:** + - Streamlined the personal story and project vision in the `README.md` for clarity. + +- **Removed Deprecated Files:** + - Deleted several deprecated files and examples that are no longer relevant. + +--- +**Previous Releases:** + +### 0.4.24x (2024-12-31) +- **Enhanced SSL & Security**: New SSL certificate handling with custom paths and validation options for secure crawling. +- **Smart Content Filtering**: Advanced filtering system with regex support and efficient chunking strategies. +- **Improved JSON Extraction**: Support for complex JSONPath, JSON-CSS, and Microdata extraction. +- **New Field Types**: Added `computed`, `conditional`, `aggregate`, and `template` field types. +- **Performance Boost**: Optimized caching, parallel processing, and memory management. +- **Better Error Handling**: Enhanced debugging capabilities with detailed error tracking. +- **Security Features**: Improved input validation and safe expression evaluation. 
+ +### 0.4.247 (2025-01-06) + +#### Added +- **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) +- **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +#### Changed +- **Version Bump**: Updated the version from `0.4.246` to `0.4.247`. ([#__version__.py](crawl4ai/__version__.py)) +- **Improved Scrolling Logic**: Enhanced scrolling methods in `AsyncPlaywrightCrawlerStrategy` by adding a `scroll_delay` parameter for better control. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) +- **Markdown Generation Example**: Updated the `hello_world.py` example to reflect the latest API changes and better illustrate features. ([#examples/hello_world.py](docs/examples/hello_world.py)) +- **Documentation Update**: + - Added Windows-specific instructions for handling asyncio event loops. ([#async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) + +#### Removed +- **Legacy Markdown Generation Code**: Removed outdated and unused code for markdown generation in `content_scraping_strategy.py`. ([#content_scraping_strategy.py](crawl4ai/content_scraping_strategy.py)) + +#### Fixed +- **Page Closing to Prevent Memory Leaks**: + - **Description**: Added a `finally` block to ensure pages are closed when no `session_id` is provided. + - **Impact**: Prevents memory leaks caused by lingering pages after a crawl. 
+ - **File**: [`async_crawler_strategy.py`](crawl4ai/async_crawler_strategy.py) + - **Code**: + ```python + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() + ``` +- **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. ([#extraction_strategy.py](crawl4ai/extraction_strategy.py)) +- **Error Handling in Scrolling**: Added robust error handling to ensure scrolling proceeds safely even if a configuration is missing. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +#### Other +- **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. ([#.gitignore](.gitignore)) + + ## [0.4.24] - 2024-12-31 ### Added diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index beda64f8..482afdd7 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -12,10 +12,11 @@ from .extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy, + JsonXPathExtractionStrategy ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator -from .content_filter_strategy import PruningContentFilter, BM25ContentFilter +from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter from .models import CrawlResult, MarkdownGenerationResult from .async_dispatcher import ( MemoryAdaptiveDispatcher, @@ -39,11 +40,13 @@ __all__ = [ "LLMExtractionStrategy", "CosineStrategy", "JsonCssExtractionStrategy", + "JsonXPathExtractionStrategy", "ChunkingStrategy", "RegexChunking", "DefaultMarkdownGenerator", "PruningContentFilter", "BM25ContentFilter", + "LLMContentFilter", "BaseDispatcher", "MemoryAdaptiveDispatcher", "SemaphoreDispatcher", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 
ea8194f4..5d2b86af 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.248" +__version__ = "0.4.3b1" diff --git a/docs/examples/dispatcher_example.py b/docs/examples/dispatcher_example.py index c9708ccc..ae6406bb 100644 --- a/docs/examples/dispatcher_example.py +++ b/docs/examples/dispatcher_example.py @@ -12,6 +12,7 @@ from crawl4ai import ( CrawlerMonitor, DisplayMode, CacheMode, + LXMLWebScrapingStrategy, ) @@ -113,7 +114,7 @@ def create_performance_table(results): async def main(): urls = [f"https://example.com/page{i}" for i in range(1, 20)] browser_config = BrowserConfig(headless=True, verbose=False) - run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, scraping_strategy=LXMLWebScrapingStrategy()) results = { "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config), diff --git a/docs/examples/llm_markdown_generator.py b/docs/examples/llm_markdown_generator.py new file mode 100644 index 00000000..60b8549d --- /dev/null +++ b/docs/examples/llm_markdown_generator.py @@ -0,0 +1,87 @@ +import os +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.content_filter_strategy import LLMContentFilter + +async def test_llm_filter(): + # Create an HTML source that needs intelligent filtering + url = "https://docs.python.org/3/tutorial/classes.html" + + browser_config = BrowserConfig( + headless=True, + verbose=True + ) + + # run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # First get the raw HTML + result = await crawler.arun(url, config=run_config) + html = result.cleaned_html + + # Initialize LLM filter with focused instruction + filter = LLMContentFilter( + provider="openai/gpt-4o", + 
api_token=os.getenv('OPENAI_API_KEY'), + instruction=""" + Focus on extracting the core educational content about Python classes. + Include: + - Key concepts and their explanations + - Important code examples + - Essential technical details + Exclude: + - Navigation elements + - Sidebars + - Footer content + - Version information + - Any non-essential UI elements + + Format the output as clean markdown with proper code blocks and headers. + """, + verbose=True + ) + + filter = LLMContentFilter( + provider="openai/gpt-4o", + api_token=os.getenv('OPENAI_API_KEY'), + chunk_token_threshold=2 ** 12 * 2, # 2048 * 2 + instruction=""" + Extract the main educational content while preserving its original wording and substance completely. Your task is to: + + 1. Maintain the exact language and terminology used in the main content + 2. Keep all technical explanations, examples, and educational content intact + 3. Preserve the original flow and structure of the core content + 4. Remove only clearly irrelevant elements like: + - Navigation menus + - Advertisement sections + - Cookie notices + - Footers with site information + - Sidebars with external links + - Any UI elements that don't contribute to learning + + The goal is to create a clean markdown version that reads exactly like the original article, + keeping all valuable content but free from distracting elements. Imagine you're creating + a perfect reading experience where nothing valuable is lost, but all noise is removed. 
+ """, + verbose=True + ) + + # Apply filtering + filtered_content = filter.filter_content(html, ignore_cache = True) + + # Show results + print("\nFiltered Content Length:", len(filtered_content)) + print("\nFirst 500 chars of filtered content:") + if filtered_content: + print(filtered_content[0][:500]) + + # Save on disc the markdown version + with open("filtered_content.md", "w", encoding="utf-8") as f: + f.write("\n".join(filtered_content)) + + # Show token usage + filter.show_usage() + +if __name__ == "__main__": + asyncio.run(test_llm_filter()) \ No newline at end of file diff --git a/docs/examples/scraping_strategies_performance.py b/docs/examples/scraping_strategies_performance.py new file mode 100644 index 00000000..b8c80be2 --- /dev/null +++ b/docs/examples/scraping_strategies_performance.py @@ -0,0 +1,135 @@ +import time, re +from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy +import time +import functools +from collections import defaultdict + +class TimingStats: + def __init__(self): + self.stats = defaultdict(lambda: defaultdict(lambda: {"calls": 0, "total_time": 0})) + + def add(self, strategy_name, func_name, elapsed): + self.stats[strategy_name][func_name]["calls"] += 1 + self.stats[strategy_name][func_name]["total_time"] += elapsed + + def report(self): + for strategy_name, funcs in self.stats.items(): + print(f"\n{strategy_name} Timing Breakdown:") + print("-" * 60) + print(f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}") + print("-" * 60) + + for func, data in sorted(funcs.items(), key=lambda x: x[1]["total_time"], reverse=True): + avg_ms = (data["total_time"] / data["calls"]) * 1000 + print(f"{func:<30} {data['calls']:<10} {data['total_time']:<10.3f} {avg_ms:<10.2f}") + +timing_stats = TimingStats() + +# Modify timing decorator +def timing_decorator(strategy_name): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + start = time.time() + result = 
func(*args, **kwargs) + elapsed = time.time() - start + timing_stats.add(strategy_name, func.__name__, elapsed) + return result + return wrapper + return decorator + +# Modified decorator application +def apply_decorators(cls, method_name, strategy_name): + try: + original_method = getattr(cls, method_name) + decorated_method = timing_decorator(strategy_name)(original_method) + setattr(cls, method_name, decorated_method) + except AttributeError: + print(f"Method {method_name} not found in class {cls.__name__}.") + +# Apply to key methods +methods_to_profile = [ + '_scrap', + # 'process_element', + '_process_element', + 'process_image', +] + + +# Apply decorators to both strategies +for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]: + for method in methods_to_profile: + apply_decorators(strategy, method, name) + + +def generate_large_html(n_elements=1000): + html = [''] + for i in range(n_elements): + html.append(f''' +
+

Heading {i}

+
+
+

This is paragraph {i} with some content and a link

+
+
+ Image {i} + +
+ ''') + html.append('') + return ''.join(html) + +def test_scraping(): + # Initialize both scrapers + original_scraper = WebScrapingStrategy() + selected_scraper = LXMLWebScrapingStrategy() + + # Generate test HTML + print("Generating HTML...") + html = generate_large_html(5000) + print(f"HTML Size: {len(html)/1024:.2f} KB") + + # Time the scraping + print("\nStarting scrape...") + start_time = time.time() + + kwargs = { + "url": "http://example.com", + "html": html, + "word_count_threshold": 5, + "keep_data_attributes": True + } + + t1 = time.perf_counter() + result_selected = selected_scraper.scrap(**kwargs) + t2 = time.perf_counter() + + result_original = original_scraper.scrap(**kwargs) + t3 = time.perf_counter() + + elapsed = t3 - start_time + print(f"\nScraping completed in {elapsed:.2f} seconds") + + timing_stats.report() + + # Print stats of LXML output + print("\nLXML Output:") + print(f"\nExtracted links: {len(result_selected['links']['internal']) + len(result_selected['links']['external'])}") + print(f"Extracted images: {len(result_selected['media']['images'])}") + print(f"Clean HTML size: {len(result_selected['cleaned_html'])/1024:.2f} KB") + print(f"Scraping time: {t2 - t1:.2f} seconds") + + # Print stats of original output + print("\nOriginal Output:") + print(f"\nExtracted links: {len(result_original['links']['internal']) + len(result_original['links']['external'])}") + print(f"Extracted images: {len(result_original['media']['images'])}") + print(f"Clean HTML size: {len(result_original['cleaned_html'])/1024:.2f} KB") + print(f"Scraping time: {t3 - t1:.2f} seconds") + + +if __name__ == "__main__": + test_scraping() \ No newline at end of file diff --git a/docs/examples/v0_4_3_features_demo.py b/docs/examples/v0_4_3_features_demo.py new file mode 100644 index 00000000..2ffaa172 --- /dev/null +++ b/docs/examples/v0_4_3_features_demo.py @@ -0,0 +1,252 @@ +""" +Crawl4ai v0.4.3 Features Demo +============================ + +This example demonstrates the 
major new features introduced in Crawl4ai v0.4.3. +Each section showcases a specific feature with practical examples and explanations. +""" + +import asyncio +import os +from crawl4ai import * + + +async def demo_memory_dispatcher(): + """ + 1. Memory Dispatcher System Demo + =============================== + Shows how to use the new memory dispatcher with monitoring + """ + print("\n=== 1. Memory Dispatcher System Demo ===") + + # Configure crawler + browser_config = BrowserConfig(headless=True, verbose=True) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator() + ) + + # Test URLs + urls = ["http://example.com", "http://example.org", "http://example.net"] * 3 + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Initialize dispatcher with monitoring + monitor = CrawlerMonitor( + max_visible_rows=10, + display_mode=DisplayMode.DETAILED, # Can be DETAILED or AGGREGATED + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, # Memory usage threshold + check_interval=0.5, # How often to check memory + max_session_permit=5, # Max concurrent crawls + monitor=monitor, # Pass the monitor + ) + + # Run with memory monitoring + print("Starting batch crawl with memory monitoring...") + results = await dispatcher.run_urls( + urls=urls, + crawler=crawler, + config=crawler_config, + ) + print(f"Completed {len(results)} URLs") + + +async def demo_streaming_support(): + """ + 2. Streaming Support Demo + ====================== + Shows how to process URLs as they complete using streaming + """ + print("\n=== 2. 
Streaming Support Demo ===") + + browser_config = BrowserConfig(headless=True, verbose=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True) + + # Test URLs + urls = ["http://example.com", "http://example.org", "http://example.net"] * 2 + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Initialize dispatcher for streaming + dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5) + + print("Starting streaming crawl...") + async for result in dispatcher.run_urls_stream( + urls=urls, crawler=crawler, config=crawler_config + ): + # Process each result as it arrives + print( + f"Received result for {result.url} - Success: {result.result.success}" + ) + if result.result.success: + print(f"Content length: {len(result.result.markdown)}") + + +async def demo_content_scraping(): + """ + 3. Content Scraping Strategy Demo + ============================== + Demonstrates the new LXMLWebScrapingStrategy for faster content scraping. + """ + print("\n=== 3. Content Scraping Strategy Demo ===") + + crawler = AsyncWebCrawler() + url = "https://example.com/article" + + # Configure with the new LXML strategy + config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True) + + print("Scraping content with LXML strategy...") + async with crawler: + result = await crawler.arun(url, config=config) + if result.success: + print("Successfully scraped content using LXML strategy") + + +async def demo_llm_markdown(): + """ + 4. LLM-Powered Markdown Generation Demo + =================================== + Shows how to use the new LLM-powered content filtering and markdown generation. + """ + print("\n=== 4. 
LLM-Powered Markdown Generation Demo ===") + + crawler = AsyncWebCrawler() + url = "https://docs.python.org/3/tutorial/classes.html" + + content_filter = LLMContentFilter( + provider="openai/gpt-4o", + api_token=os.getenv("OPENAI_API_KEY"), + instruction=""" + Focus on extracting the core educational content about Python classes. + Include: + - Key concepts and their explanations + - Important code examples + - Essential technical details + Exclude: + - Navigation elements + - Sidebars + - Footer content + - Version information + - Any non-essential UI elements + + Format the output as clean markdown with proper code blocks and headers. + """, + verbose=True, + ) + + # Configure LLM-powered markdown generation + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=content_filter + ), + cache_mode = CacheMode.BYPASS, + verbose=True + ) + + print("Generating focused markdown with LLM...") + async with crawler: + result = await crawler.arun(url, config=config) + if result.success and result.markdown_v2: + print("Successfully generated LLM-filtered markdown") + print("First 500 chars of filtered content:") + print(result.markdown_v2.fit_markdown[:500]) + print("Successfully generated LLM-filtered markdown") + + +async def demo_robots_compliance(): + """ + 5. Robots.txt Compliance Demo + ========================== + Demonstrates the new robots.txt compliance feature with SQLite caching. + """ + print("\n=== 5. 
Robots.txt Compliance Demo ===") + + crawler = AsyncWebCrawler() + urls = ["https://example.com", "https://facebook.com", "https://twitter.com"] + + # Enable robots.txt checking + config = CrawlerRunConfig(check_robots_txt=True, verbose=True) + + print("Crawling with robots.txt compliance...") + async with crawler: + results = await crawler.arun_many(urls, config=config) + for result in results: + if result.status_code == 403: + print(f"Access blocked by robots.txt: {result.url}") + elif result.success: + print(f"Successfully crawled: {result.url}") + + + +async def demo_llm_schema_generation(): + """ + 7. LLM-Powered Schema Generation Demo + ================================= + Demonstrates automatic CSS and XPath schema generation using LLM models. + """ + print("\n=== 7. LLM-Powered Schema Generation Demo ===") + + # Example HTML content for a job listing + html_content = """ +
+

Senior Software Engineer

+
+ San Francisco, CA + $150,000 - $200,000 +
+

Requirements

+
    +
  • 5+ years Python experience
  • +
  • Strong background in web crawling
  • +
+
+
+
+ """ + + print("Generating CSS selectors schema...") + # Generate CSS selectors with a specific query + css_schema = JsonCssExtractionStrategy.generate_schema( + html_content, + schema_type="CSS", + query="Extract job title, location, and salary information", + provider="openai/gpt-4o", # or use other providers like "ollama" + ) + print("\nGenerated CSS Schema:") + print(css_schema) + + # Example of using the generated schema with crawler + crawler = AsyncWebCrawler() + url = "https://example.com/job-listing" + + # Create an extraction strategy with the generated schema + extraction_strategy = JsonCssExtractionStrategy(schema=css_schema) + + config = CrawlerRunConfig(extraction_strategy=extraction_strategy, verbose=True) + + print("\nTesting generated schema with crawler...") + async with crawler: + result = await crawler.arun(url, config=config) + if result.success: + print(json.dumps(result.extracted_content, indent=2) if result.extracted_content else None) + print("Successfully used generated schema for crawling") + + +async def main(): + """Run all feature demonstrations.""" + demo_memory_dispatcher(), + print("\n" + "=" * 50 + "\n") + demo_streaming_support(), + print("\n" + "=" * 50 + "\n") + demo_content_scraping(), + print("\n" + "=" * 50 + "\n") + demo_llm_schema_generation(), + print("\n" + "=" * 50 + "\n") + demo_llm_markdown(), + print("\n" + "=" * 50 + "\n") + demo_robots_compliance(), + print("\n" + "=" * 50 + "\n") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/md_v2/advanced/multi-url-crawling copy.md b/docs/md_v2/advanced/multi-url-crawling copy.md deleted file mode 100644 index a1d2b423..00000000 --- a/docs/md_v2/advanced/multi-url-crawling copy.md +++ /dev/null @@ -1,264 +0,0 @@ -# Optimized Multi-URL Crawling - -> **Note**: We’re developing a new **executor module** that uses a sophisticated algorithm to dynamically manage multi-URL crawling, optimizing for speed and memory usage. 
The approaches in this document remain fully valid, but keep an eye on **Crawl4AI**’s upcoming releases for this powerful feature! Follow [@unclecode](https://twitter.com/unclecode) on X and check the changelogs to stay updated. - - -Crawl4AI’s **AsyncWebCrawler** can handle multiple URLs in a single run, which can greatly reduce overhead and speed up crawling. This guide shows how to: - -1. **Sequentially** crawl a list of URLs using the **same** session, avoiding repeated browser creation. -2. **Parallel**-crawl subsets of URLs in batches, again reusing the same browser. - -When the entire process finishes, you close the browser once—**minimizing** memory and resource usage. - ---- - -## 1. Why Avoid Simple Loops per URL? - -If you naively do: - -```python -for url in urls: - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url) -``` - -You end up: - -1. Spinning up a **new** browser for each URL -2. Closing it immediately after the single crawl -3. Potentially using a lot of CPU/memory for short-living browsers -4. Missing out on session reusability if you have login or ongoing states - -**Better** approaches ensure you **create** the browser once, then crawl multiple URLs with minimal overhead. - ---- - -## 2. Sequential Crawling with Session Reuse - -### 2.1 Overview - -1. **One** `AsyncWebCrawler` instance for **all** URLs. -2. **One** session (via `session_id`) so we can preserve local storage or cookies across URLs if needed. -3. The crawler is only closed at the **end**. - -**This** is the simplest pattern if your workload is moderate (dozens to a few hundred URLs). 
- -### 2.2 Example Code - -```python -import asyncio -from typing import List -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator - -async def crawl_sequential(urls: List[str]): - print("\n=== Sequential Crawling with Session Reuse ===") - - browser_config = BrowserConfig( - headless=True, - # For better performance in Docker or low-memory environments: - extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"], - ) - - crawl_config = CrawlerRunConfig( - markdown_generator=DefaultMarkdownGenerator() - ) - - # Create the crawler (opens the browser) - crawler = AsyncWebCrawler(config=browser_config) - await crawler.start() - - try: - session_id = "session1" # Reuse the same session across all URLs - for url in urls: - result = await crawler.arun( - url=url, - config=crawl_config, - session_id=session_id - ) - if result.success: - print(f"Successfully crawled: {url}") - # E.g. check markdown length - print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}") - else: - print(f"Failed: {url} - Error: {result.error_message}") - finally: - # After all URLs are done, close the crawler (and the browser) - await crawler.close() - -async def main(): - urls = [ - "https://example.com/page1", - "https://example.com/page2", - "https://example.com/page3" - ] - await crawl_sequential(urls) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -**Why It’s Good**: - -- **One** browser launch. -- Minimal memory usage. -- If the site requires login, you can log in once in `session_id` context and preserve auth across all URLs. - ---- - -## 3. Parallel Crawling with Browser Reuse - -### 3.1 Overview - -To speed up crawling further, you can crawl multiple URLs in **parallel** (batches or a concurrency limit). The crawler still uses **one** browser, but spawns different sessions (or the same, depending on your logic) for each task. 
- -### 3.2 Example Code - -For this example make sure to install the [psutil](https://pypi.org/project/psutil/) package. - -```bash -pip install psutil -``` - -Then you can run the following code: - -```python -import os -import sys -import psutil -import asyncio - -__location__ = os.path.dirname(os.path.abspath(__file__)) -__output__ = os.path.join(__location__, "output") - -# Append parent directory to system path -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) - -from typing import List -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode - -async def crawl_parallel(urls: List[str], max_concurrent: int = 3): - print("\n=== Parallel Crawling with Browser Reuse + Memory Check ===") - - # We'll keep track of peak memory usage across all tasks - peak_memory = 0 - process = psutil.Process(os.getpid()) - - def log_memory(prefix: str = ""): - nonlocal peak_memory - current_mem = process.memory_info().rss # in bytes - if current_mem > peak_memory: - peak_memory = current_mem - print(f"{prefix} Current Memory: {current_mem // (1024 * 1024)} MB, Peak: {peak_memory // (1024 * 1024)} MB") - - # Minimal browser config - browser_config = BrowserConfig( - headless=True, - verbose=False, # corrected from 'verbos=False' - extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"], - ) - crawl_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - - # Create the crawler instance - crawler = AsyncWebCrawler(config=browser_config) - await crawler.start() - - try: - # We'll chunk the URLs in batches of 'max_concurrent' - success_count = 0 - fail_count = 0 - for i in range(0, len(urls), max_concurrent): - batch = urls[i : i + max_concurrent] - tasks = [] - - for j, url in enumerate(batch): - # Unique session_id per concurrent sub-task - session_id = f"parallel_session_{i + j}" - task = crawler.arun(url=url, config=crawl_config, session_id=session_id) - tasks.append(task) - - # Check 
memory usage prior to launching tasks - log_memory(prefix=f"Before batch {i//max_concurrent + 1}: ") - - # Gather results - results = await asyncio.gather(*tasks, return_exceptions=True) - - # Check memory usage after tasks complete - log_memory(prefix=f"After batch {i//max_concurrent + 1}: ") - - # Evaluate results - for url, result in zip(batch, results): - if isinstance(result, Exception): - print(f"Error crawling {url}: {result}") - fail_count += 1 - elif result.success: - success_count += 1 - else: - fail_count += 1 - - print(f"\nSummary:") - print(f" - Successfully crawled: {success_count}") - print(f" - Failed: {fail_count}") - - finally: - print("\nClosing crawler...") - await crawler.close() - # Final memory log - log_memory(prefix="Final: ") - print(f"\nPeak memory usage (MB): {peak_memory // (1024 * 1024)}") - -async def main(): - urls = [ - "https://example.com/page1", - "https://example.com/page2", - "https://example.com/page3", - "https://example.com/page4" - ] - await crawl_parallel(urls, max_concurrent=2) - -if __name__ == "__main__": - asyncio.run(main()) - -``` - -**Notes**: - -- We **reuse** the same `AsyncWebCrawler` instance for all parallel tasks, launching **one** browser. -- Each parallel sub-task might get its own `session_id` so they don’t share cookies/localStorage (unless that’s desired). -- We limit concurrency to `max_concurrent=2` or 3 to avoid saturating CPU/memory. - ---- - -## 4. Performance Tips - -1. **Extra Browser Args** - - `--disable-gpu`, `--no-sandbox` can help in Docker or restricted environments. - - `--disable-dev-shm-usage` avoids using `/dev/shm` which can be small on some systems. - -2. **Session Reuse** - - If your site requires a login or you want to maintain local data across URLs, share the **same** `session_id`. - - If you want isolation (each URL fresh), create unique sessions. - -3. **Batching** - - If you have **many** URLs (like thousands), you can do parallel crawling in chunks (like `max_concurrent=5`). 
- - Use `arun_many()` for a built-in approach if you prefer, but the example above is often more flexible. - -4. **Cache** - - If your pages share many resources or you’re re-crawling the same domain repeatedly, consider setting `cache_mode=CacheMode.ENABLED` in `CrawlerRunConfig`. - - If you need fresh data each time, keep `cache_mode=CacheMode.BYPASS`. - -5. **Hooks** - - You can set up global hooks for each crawler (like to block images) or per-run if you want. - - Keep them consistent if you’re reusing sessions. - ---- - -## 5. Summary - -- **One** `AsyncWebCrawler` + multiple calls to `.arun()` is far more efficient than launching a new crawler per URL. -- **Sequential** approach with a shared session is simple and memory-friendly for moderate sets of URLs. -- **Parallel** approach can speed up large crawls by concurrency, but keep concurrency balanced to avoid overhead. -- Close the crawler once at the end, ensuring the browser is only opened/closed once. - -For even more advanced memory optimizations or dynamic concurrency patterns, see future sections on hooking or distributed crawling. The patterns above suffice for the majority of multi-URL scenarios—**giving you speed, simplicity, and minimal resource usage**. Enjoy your optimized crawling! 
\ No newline at end of file diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md index d9b04535..12c4f916 100644 --- a/docs/md_v2/advanced/multi-url-crawling.md +++ b/docs/md_v2/advanced/multi-url-crawling.md @@ -58,7 +58,7 @@ Automatically manages concurrency based on system memory usage: ```python dispatcher = MemoryAdaptiveDispatcher( - memory_threshold_percent=70.0, # Pause if memory exceeds this + memory_threshold_percent=90.0, # Pause if memory exceeds this check_interval=1.0, # How often to check memory max_session_permit=10, # Maximum concurrent tasks rate_limiter=RateLimiter( # Optional rate limiting @@ -79,7 +79,7 @@ Provides simple concurrency control with a fixed limit: ```python dispatcher = SemaphoreDispatcher( - semaphore_count=5, # Fixed concurrent tasks + max_session_permit=5, # Fixed concurrent tasks rate_limiter=RateLimiter( # Optional rate limiting base_delay=(0.5, 1.0), max_delay=10.0 diff --git a/docs/md_v2/blog/releases/v0.4.3b1.md b/docs/md_v2/blog/releases/v0.4.3b1.md new file mode 100644 index 00000000..f648b462 --- /dev/null +++ b/docs/md_v2/blog/releases/v0.4.3b1.md @@ -0,0 +1,266 @@ +# Crawl4AI 0.4.3b1 is Here: Faster, Smarter, and Ready for Real-World Crawling! + +Hey, Crawl4AI enthusiasts! We're thrilled to announce the release of **Crawl4AI 0.4.3b1**, packed with powerful new features and enhancements that take web crawling to a whole new level of efficiency and intelligence. This release is all about giving you more control, better performance, and deeper insights into your crawled data. + +Let's dive into what's new! + +## 🚀 Major Feature Highlights + +### 1. LLM-Powered Schema Generation: Zero to Structured Data in Seconds! + +Tired of manually crafting CSS or XPath selectors? We've got you covered! Crawl4AI now features a revolutionary **schema generator** that uses the power of Large Language Models (LLMs) to automatically create extraction schemas for you. + +**How it Works:** + +1. 
**Provide HTML**: Feed in a sample HTML snippet that contains the type of data you want to extract (e.g., product listings, article sections). +2. **Describe Your Needs (Optional)**: You can provide a natural language query like "extract all product names and prices" to guide the schema creation. +3. **Choose Your LLM**: Use either **OpenAI** (GPT-4o recommended) for top-tier accuracy or **Ollama** for a local, open-source option. +4. **Get Your Schema**: The tool outputs a ready-to-use JSON schema that works seamlessly with `JsonCssExtractionStrategy` or `JsonXPathExtractionStrategy`. + +**Why You'll Love It:** + +- **No More Tedious Selector Writing**: Let the LLM analyze the HTML and create the selectors for you! +- **One-Time Cost**: Schema generation uses LLM, but once you have your schema, subsequent extractions are fast and LLM-free. +- **Handles Complex Structures**: The LLM can understand nested elements, lists, and variations in layout—far beyond what simple CSS selectors can achieve. +- **Learn by Example**: The generated schemas are a fantastic way to learn best practices for writing your own schemas. + +**Example:** + +```python +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +# Sample HTML snippet (imagine this is part of a product listing page) +html = """ +
+<div class="product">
+  <h2 class="name">
+    Awesome Gadget
+  </h2>
+  <span class="price">$99.99</span>
+</div>
+""" + +# Generate schema using OpenAI +schema = JsonCssExtractionStrategy.generate_schema( + html, + llm_provider="openai/gpt-4o", + api_token="YOUR_API_TOKEN" +) + +# Or use Ollama for a local, open-source option +# schema = JsonCssExtractionStrategy.generate_schema( +# html, +# llm_provider="ollama/llama3" +# ) + +print(json.dumps(schema, indent=2)) +``` + +**Output (Schema):** + +```json +{ + "name": null, + "baseSelector": "div.product", + "fields": [ + { + "name": "name", + "selector": "h2.name", + "type": "text" + }, + { + "name": "price", + "selector": "span.price", + "type": "text" + } + ] +} +``` + +You can now **save** this schema and use it for all your extractions on pages with the same structure. No more LLM costs, just **fast, reliable** data extraction! + +### 2. Robots.txt Compliance: Crawl Responsibly + +Crawl4AI now respects website rules! With the new `check_robots_txt=True` option in `CrawlerRunConfig`, the crawler automatically fetches, parses, and obeys each site's `robots.txt` file. + +**Key Features**: + +- **Efficient Caching**: Stores parsed `robots.txt` files locally for 7 days to avoid re-fetching. +- **Automatic Integration**: Works seamlessly with both `arun()` and `arun_many()`. +- **Clear Status Codes**: Returns a 403 status code if a URL is disallowed. +- **Customizable**: Adjust the cache directory and TTL if needed. + +**Example**: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def main(): + config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + check_robots_txt=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/private-page", config=config) + if result.status_code == 403: + print("Access denied by robots.txt") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### 3. Proxy Support in `CrawlerRunConfig` + +Need more control over your proxy settings? 
Now you can configure proxies directly within `CrawlerRunConfig` for each crawl: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + config = CrawlerRunConfig( + proxy_config={ + "server": "http://your-proxy.com:8080", + "username": "your_username", # Optional + "password": "your_password" # Optional + } + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) +``` + +This allows for dynamic proxy assignment per URL or even per request. + +### 4. LLM-Powered Markdown Filtering (Beta) + +We're introducing an experimental **`LLMContentFilter`**! This filter, when used with the `DefaultMarkdownGenerator`, can produce highly focused markdown output by using an LLM to analyze content relevance. + +**How it Works:** + +1. You provide an **instruction** (e.g., "extract only the key technical details"). +2. The LLM analyzes each section of the page based on your instruction. +3. Only the most relevant content is included in the final `fit_markdown`. + +**Example**: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.content_filter_strategy import LLMContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def main(): + llm_filter = LLMContentFilter( + provider="openai/gpt-4o", + api_token="YOUR_API_TOKEN", # Or use "ollama/llama3" with no token + instruction="Extract the core educational content about Python classes." + ) + + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator(content_filter=llm_filter) + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://docs.python.org/3/tutorial/classes.html", + config=config + ) + print(result.markdown_v2.fit_markdown) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Note**: This is a beta feature. We're actively working on improving its accuracy and performance. + +### 5. 
Streamlined `arun_many()` with Dispatchers + +We've simplified concurrent crawling! `arun_many()` now intelligently handles multiple URLs, either returning a **list** of results or an **async generator** for streaming. + +**Basic Usage (Batch)**: + +```python +results = await crawler.arun_many( + urls=["https://site1.com", "https://site2.com"], + config=CrawlerRunConfig() +) + +for res in results: + print(res.url, "crawled successfully:", res.success) +``` + +**Streaming Mode**: + +```python +async for result in await crawler.arun_many( + urls=["https://site1.com", "https://site2.com"], + config=CrawlerRunConfig(stream=True) +): + print("Just finished:", result.url) + # Process each result immediately +``` + +**Advanced:** You can now customize how `arun_many` handles concurrency by passing a **dispatcher**. See [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) for details. + +### 6. Enhanced Browser Context Management + +We've improved how Crawl4AI manages browser contexts for better resource utilization and session handling. + +- **`shared_data` in `CrawlerRunConfig`**: Pass data between hooks using the `shared_data` dictionary. +- **Context Reuse**: The crawler now intelligently reuses browser contexts based on configuration, reducing overhead. + +### 7. Faster Scraping with `LXMLWebScrapingStrategy` + +Introducing a new, optional **`LXMLWebScrapingStrategy`** that can be **10-20x faster** than the default BeautifulSoup approach for large, complex pages. + +**How to Use**: + +```python +from crawl4ai import LXMLWebScrapingStrategy + +config = CrawlerRunConfig( + scraping_strategy=LXMLWebScrapingStrategy() # Add this line +) +``` + +**When to Use**: +- If profiling shows a bottleneck in `WebScrapingStrategy`. +- For very large HTML documents where parsing speed matters. + +**Caveats**: +- It might not handle malformed HTML as gracefully as BeautifulSoup. +- We're still gathering data, so report any issues! + +--- + +## Try the Feature Demo Script! 
+
+We've prepared a Python script demonstrating these new features. You can find it at:
+
+[**`v0_4_3_features_demo.py`**](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/v0_4_3_features_demo.py)
+
+**To run the demo:**
+
+1. Make sure you have Crawl4AI installed (`pip install crawl4ai`).
+2. Copy the `v0_4_3_features_demo.py` script to your local environment.
+3. Set your OpenAI API key as an environment variable (if using OpenAI models):
+   ```bash
+   export OPENAI_API_KEY="your_api_key"
+   ```
+4. Run the script:
+   ```bash
+   python v0_4_3_features_demo.py
+   ```
+
+The script will execute various crawl scenarios, showcasing the new features and printing results to your console.
+
+## Conclusion
+
+Crawl4AI version 0.4.3b1 is a major step forward in flexibility, performance, and ease of use. With automatic schema generation, robots.txt handling, advanced content filtering, and streamlined multi-URL crawling, you can build powerful, efficient, and responsible web scrapers.
+
+We encourage you to try out these new capabilities, explore the updated documentation, and share your feedback! Your input is invaluable as we continue to improve Crawl4AI.
+
+**Stay Connected:**
+
+- **Star** us on [GitHub](https://github.com/unclecode/crawl4ai) to show your support!
+- **Follow** [@unclecode](https://twitter.com/unclecode) on Twitter for updates and tips.
+- **Join** our community on Discord (link coming soon) to discuss your projects and get help.
+
+Happy crawling!
diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index 98a30652..ab8f9b05 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -181,7 +181,7 @@ from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): # Initialize LLM filter with specific instruction filter = LLMContentFilter( - provider="openai/gpt-4", # or your preferred provider + provider="openai/gpt-4o", # or your preferred provider api_token="your-api-token", # or use environment variable instruction=""" Focus on extracting the core educational content. diff --git a/pyproject.toml b/pyproject.toml index c9bd9ad3..328438e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "httpx==0.27.2", ] classifiers = [ - "Development Status :: 3 - Alpha", + "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3",