From 361499d291dbfb2067ffc4efac40ac9ea9f9755c Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 29 Sep 2025 18:05:26 +0800 Subject: [PATCH] Release v0.7.5: The Update - Updated version to 0.7.5 - Added comprehensive demo and release notes - Updated documentation --- README.md | 50 ++++- crawl4ai/__version__.py | 2 +- docs/blog/release-v0.7.5.md | 238 +++++++++++++++++++++ docs/md_v2/blog/index.md | 25 ++- docs/md_v2/blog/releases/v0.7.5.md | 238 +++++++++++++++++++++ docs/releases_review/demo_v0.7.5.py | 309 ++++++++++++++++++++++++++++ 6 files changed, 850 insertions(+), 12 deletions(-) create mode 100644 docs/blog/release-v0.7.5.md create mode 100644 docs/md_v2/blog/releases/v0.7.5.md create mode 100644 docs/releases_review/demo_v0.7.5.py diff --git a/README.md b/README.md index 45f11560..58d4bf4c 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,13 @@ Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle tested by a 50k+ star community. -[✨ Check out latest update v0.7.4](#-recent-updates) +[✨ Check out latest update v0.7.5](#-recent-updates) -✨ New in v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md) +✨ New in v0.7.5: Docker Hooks System for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md) -✨ Recent v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. 
[Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md) +✨ Recent v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md) + +✨ Previous v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
🤓 My Personal Story @@ -544,6 +546,48 @@ async def test_news_crawl(): ## ✨ Recent Updates +
+Version 0.7.5 Release Highlights - The Docker Hooks & Security Update + +- **🔧 Docker Hooks System**: Complete pipeline customization with user-provided Python functions: + ```python + import requests + + # Real working hooks for httpbin.org + hooks_config = { + "on_page_context_created": """ + async def hook(page, context, **kwargs): + print("Hook: Setting up page context") + # Block images to speed up crawling + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + return page + """, + "before_goto": """ + async def hook(page, context, url, **kwargs): + print(f"Hook: About to navigate to {url}") + # Add custom headers + await page.set_extra_http_headers({'X-Test-Header': 'crawl4ai-hooks-test'}) + return page + """ + } + + # Test with Docker API + payload = { + "urls": ["https://httpbin.org/html"], + "hooks": {"code": hooks_config, "timeout": 30} + } + response = requests.post("http://localhost:11235/crawl", json=payload) + ``` + +- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration +- **🔒 HTTPS Preservation**: Secure internal link handling with `preserve_https_for_internal_links=True` +- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance +- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration + +[Full v0.7.5 Release Notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md) + +
+
Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index b73a591d..550c1e08 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,7 +1,7 @@ # crawl4ai/__version__.py # This is the version that will be used for stable releases -__version__ = "0.7.4" +__version__ = "0.7.5" # For nightly builds, this gets set during build process __nightly_version__ = None diff --git a/docs/blog/release-v0.7.5.md b/docs/blog/release-v0.7.5.md new file mode 100644 index 00000000..5740873f --- /dev/null +++ b/docs/blog/release-v0.7.5.md @@ -0,0 +1,238 @@ +# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update + +*September 29, 2025 • 8 min read* + +--- + +Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements. + +## 🎯 What's New at a Glance + +- **Docker Hooks System**: Custom Python functions at key pipeline points +- **Enhanced LLM Integration**: Custom providers with temperature control +- **HTTPS Preservation**: Secure internal link handling +- **Bug Fixes**: Resolved multiple community-reported issues +- **Improved Docker Error Handling**: Better debugging and reliability + +## 🔧 Docker Hooks System: Pipeline Customization + +Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline. 
+ +### Real Example: Authentication & Performance + +```python +import requests + +# Real working hooks for httpbin.org +hooks_config = { + "on_page_context_created": """ +async def hook(page, context, **kwargs): + print("Hook: Setting up page context") + # Block images to speed up crawling + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + print("Hook: Images blocked") + return page +""", + + "before_retrieve_html": """ +async def hook(page, context, **kwargs): + print("Hook: Before retrieving HTML") + # Scroll to bottom to load lazy content + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + print("Hook: Scrolled to bottom") + return page +""", + + "before_goto": """ +async def hook(page, context, url, **kwargs): + print(f"Hook: About to navigate to {url}") + # Add custom headers + await page.set_extra_http_headers({ + 'X-Test-Header': 'crawl4ai-hooks-test' + }) + return page +""" +} + +# Test with Docker API +payload = { + "urls": ["https://httpbin.org/html"], + "hooks": { + "code": hooks_config, + "timeout": 30 + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +result = response.json() + +if result.get('success'): + print("✅ Hooks executed successfully!") + print(f"Content length: {len(result.get('markdown', ''))} characters") +``` + +**Available Hook Points:** +- `on_browser_created`: Browser setup +- `on_page_context_created`: Page context configuration +- `before_goto`: Pre-navigation setup +- `after_goto`: Post-navigation processing +- `on_user_agent_updated`: User agent changes +- `on_execution_started`: Crawl initialization +- `before_retrieve_html`: Pre-extraction processing +- `before_return_html`: Final HTML processing + +## 🤖 Enhanced LLM Integration + +Enhanced LLM integration with custom providers, temperature control, and base URL configuration. 
+ +### Multi-Provider Support + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +# Test with different providers +async def test_llm_providers(): + # Gemini with custom temperature + openai_strategy = LLMExtractionStrategy( + provider="gemini/gemini-2.5-flash-lite", + api_token="your-api-token", + temperature=0.7, # New in v0.7.5 + instruction="Summarize this page in one sentence" + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://example.com", + config=CrawlerRunConfig(extraction_strategy=openai_strategy) + ) + + if result.success: + print("✅ LLM extraction completed") + print(result.extracted_content) + +# Docker API with enhanced LLM config +llm_payload = { + "url": "https://example.com", + "f": "llm", + "q": "Summarize this page in one sentence.", + "provider": "gemini/gemini-2.5-flash-lite", + "temperature": 0.7 +} + +response = requests.post("http://localhost:11235/md", json=llm_payload) +``` + +**New Features:** +- Custom `temperature` parameter for creativity control +- `base_url` for custom API endpoints +- Multi-provider environment variable support +- Docker API integration + +## 🔒 HTTPS Preservation + +**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear. + +**Solution:** HTTPS preservation maintains secure protocols throughout crawling. 
+ +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy + +async def test_https_preservation(): + # Enable HTTPS preservation + url_filter = URLPatternFilter( + patterns=["^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"] + ) + + config = CrawlerRunConfig( + exclude_external_links=True, + preserve_https_for_internal_links=True, # New in v0.7.5 + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + max_pages=5, + filter_chain=FilterChain([url_filter]) + ) + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun( + url="https://quotes.toscrape.com", + config=config + ): + # All internal links maintain HTTPS + internal_links = [link['href'] for link in result.links['internal']] + https_links = [link for link in internal_links if link.startswith('https://')] + + print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}") + for link in https_links[:3]: + print(f" → {link}") +``` + +## 🛠️ Bug Fixes and Improvements + +### Major Fixes +- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332) +- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated) +- **Docker Error Handling**: Comprehensive error messages with status codes +- **Memory Management**: Fixed leaks in long-running sessions +- **JWT Authentication**: Fixed Docker JWT validation issues (#1442) +- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481) +- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505) +- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419) +- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291) +- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989) + +### Community-Reported Issues Fixed +This release addresses multiple issues reported by 
the community through GitHub issues and Discord discussions: +- Fixed browser configuration reference errors +- Resolved dependency conflicts with cssselect +- Improved error messaging for failed authentications +- Enhanced compatibility with various proxy configurations +- Fixed edge cases in URL normalization + +### Configuration Updates +```python +# Old proxy config (deprecated) +# browser_config = BrowserConfig(proxy="http://proxy:8080") + +# New enhanced proxy config +browser_config = BrowserConfig( + proxy_config={ + "server": "http://proxy:8080", + "username": "optional-user", + "password": "optional-pass" + } +) +``` + +## 🔄 Breaking Changes + +1. **Python 3.10+ Required**: Upgrade from Python 3.9 +2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure +3. **New Dependency**: Added `cssselect` for better CSS handling + +## 🚀 Get Started + +```bash +# Install latest version +pip install crawl4ai==0.7.5 + +# Docker deployment +docker pull unclecode/crawl4ai:latest +docker run -p 11235:11235 unclecode/crawl4ai:latest +``` + +**Try the Demo:** +```bash +# Run working examples +python docs/releases_review/demo_v0.7.5.py +``` + +**Resources:** +- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com) +- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai) +- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN) +- 🐦 Twitter: [@unclecode](https://x.com/unclecode) + +Happy crawling! 
🕷️ diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md index 6eb6112b..cedd8e86 100644 --- a/docs/md_v2/blog/index.md +++ b/docs/md_v2/blog/index.md @@ -20,17 +20,26 @@ Ever wondered why your AI coding assistant struggles with your library despite c ## Latest Release +### [Crawl4AI v0.7.5 – The Docker Hooks & Security Update](../blog/release-v0.7.5.md) +*September 29, 2025* + +Crawl4AI v0.7.5 introduces the powerful Docker Hooks System for complete pipeline customization, enhanced LLM integration with custom providers, HTTPS preservation for modern web security, and resolves multiple community-reported issues. + +Key highlights: +- **🔧 Docker Hooks System**: Custom Python functions at 8 key pipeline points for unprecedented customization +- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration +- **🔒 HTTPS Preservation**: Secure internal link handling for modern web applications +- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance +- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration + +[Read full release notes →](../blog/release-v0.7.5.md) + +## Recent Releases + ### [Crawl4AI v0.7.4 – The Intelligent Table Extraction & Performance Update](../blog/release-v0.7.4.md) *August 17, 2025* -Crawl4AI v0.7.4 introduces revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes that make Crawl4AI more robust for production workloads. 
- -Key highlights: -- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables -- **⚡ Dispatcher Bug Fix**: Fixed sequential processing issue in arun_many for fast-completing tasks -- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management -- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation -- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution +Revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes. [Read full release notes →](../blog/release-v0.7.4.md) diff --git a/docs/md_v2/blog/releases/v0.7.5.md b/docs/md_v2/blog/releases/v0.7.5.md new file mode 100644 index 00000000..5740873f --- /dev/null +++ b/docs/md_v2/blog/releases/v0.7.5.md @@ -0,0 +1,238 @@ +# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update + +*September 29, 2025 • 8 min read* + +--- + +Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements. + +## 🎯 What's New at a Glance + +- **Docker Hooks System**: Custom Python functions at key pipeline points +- **Enhanced LLM Integration**: Custom providers with temperature control +- **HTTPS Preservation**: Secure internal link handling +- **Bug Fixes**: Resolved multiple community-reported issues +- **Improved Docker Error Handling**: Better debugging and reliability + +## 🔧 Docker Hooks System: Pipeline Customization + +Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline. 
+ +### Real Example: Authentication & Performance + +```python +import requests + +# Real working hooks for httpbin.org +hooks_config = { + "on_page_context_created": """ +async def hook(page, context, **kwargs): + print("Hook: Setting up page context") + # Block images to speed up crawling + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + print("Hook: Images blocked") + return page +""", + + "before_retrieve_html": """ +async def hook(page, context, **kwargs): + print("Hook: Before retrieving HTML") + # Scroll to bottom to load lazy content + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + print("Hook: Scrolled to bottom") + return page +""", + + "before_goto": """ +async def hook(page, context, url, **kwargs): + print(f"Hook: About to navigate to {url}") + # Add custom headers + await page.set_extra_http_headers({ + 'X-Test-Header': 'crawl4ai-hooks-test' + }) + return page +""" +} + +# Test with Docker API +payload = { + "urls": ["https://httpbin.org/html"], + "hooks": { + "code": hooks_config, + "timeout": 30 + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +result = response.json() + +if result.get('success'): + print("✅ Hooks executed successfully!") + print(f"Content length: {len(result.get('markdown', ''))} characters") +``` + +**Available Hook Points:** +- `on_browser_created`: Browser setup +- `on_page_context_created`: Page context configuration +- `before_goto`: Pre-navigation setup +- `after_goto`: Post-navigation processing +- `on_user_agent_updated`: User agent changes +- `on_execution_started`: Crawl initialization +- `before_retrieve_html`: Pre-extraction processing +- `before_return_html`: Final HTML processing + +## 🤖 Enhanced LLM Integration + +Enhanced LLM integration with custom providers, temperature control, and base URL configuration. 
+ +### Multi-Provider Support + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +# Test with different providers +async def test_llm_providers(): + # Gemini with custom temperature + openai_strategy = LLMExtractionStrategy( + provider="gemini/gemini-2.5-flash-lite", + api_token="your-api-token", + temperature=0.7, # New in v0.7.5 + instruction="Summarize this page in one sentence" + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://example.com", + config=CrawlerRunConfig(extraction_strategy=openai_strategy) + ) + + if result.success: + print("✅ LLM extraction completed") + print(result.extracted_content) + +# Docker API with enhanced LLM config +llm_payload = { + "url": "https://example.com", + "f": "llm", + "q": "Summarize this page in one sentence.", + "provider": "gemini/gemini-2.5-flash-lite", + "temperature": 0.7 +} + +response = requests.post("http://localhost:11235/md", json=llm_payload) +``` + +**New Features:** +- Custom `temperature` parameter for creativity control +- `base_url` for custom API endpoints +- Multi-provider environment variable support +- Docker API integration + +## 🔒 HTTPS Preservation + +**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear. + +**Solution:** HTTPS preservation maintains secure protocols throughout crawling. 
+ +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy + +async def test_https_preservation(): + # Enable HTTPS preservation + url_filter = URLPatternFilter( + patterns=["^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"] + ) + + config = CrawlerRunConfig( + exclude_external_links=True, + preserve_https_for_internal_links=True, # New in v0.7.5 + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + max_pages=5, + filter_chain=FilterChain([url_filter]) + ) + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun( + url="https://quotes.toscrape.com", + config=config + ): + # All internal links maintain HTTPS + internal_links = [link['href'] for link in result.links['internal']] + https_links = [link for link in internal_links if link.startswith('https://')] + + print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}") + for link in https_links[:3]: + print(f" → {link}") +``` + +## 🛠️ Bug Fixes and Improvements + +### Major Fixes +- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332) +- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated) +- **Docker Error Handling**: Comprehensive error messages with status codes +- **Memory Management**: Fixed leaks in long-running sessions +- **JWT Authentication**: Fixed Docker JWT validation issues (#1442) +- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481) +- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505) +- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419) +- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291) +- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989) + +### Community-Reported Issues Fixed +This release addresses multiple issues reported by 
the community through GitHub issues and Discord discussions: +- Fixed browser configuration reference errors +- Resolved dependency conflicts with cssselect +- Improved error messaging for failed authentications +- Enhanced compatibility with various proxy configurations +- Fixed edge cases in URL normalization + +### Configuration Updates +```python +# Old proxy config (deprecated) +# browser_config = BrowserConfig(proxy="http://proxy:8080") + +# New enhanced proxy config +browser_config = BrowserConfig( + proxy_config={ + "server": "http://proxy:8080", + "username": "optional-user", + "password": "optional-pass" + } +) +``` + +## 🔄 Breaking Changes + +1. **Python 3.10+ Required**: Upgrade from Python 3.9 +2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure +3. **New Dependency**: Added `cssselect` for better CSS handling + +## 🚀 Get Started + +```bash +# Install latest version +pip install crawl4ai==0.7.5 + +# Docker deployment +docker pull unclecode/crawl4ai:latest +docker run -p 11235:11235 unclecode/crawl4ai:latest +``` + +**Try the Demo:** +```bash +# Run working examples +python docs/releases_review/demo_v0.7.5.py +``` + +**Resources:** +- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com) +- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai) +- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN) +- 🐦 Twitter: [@unclecode](https://x.com/unclecode) + +Happy crawling! 🕷️ diff --git a/docs/releases_review/demo_v0.7.5.py b/docs/releases_review/demo_v0.7.5.py new file mode 100644 index 00000000..d25778ee --- /dev/null +++ b/docs/releases_review/demo_v0.7.5.py @@ -0,0 +1,309 @@ +""" +🚀 Crawl4AI v0.7.5 Release Demo - Working Examples +================================================== +This demo showcases key features introduced in v0.7.5 with real, executable examples. + +Featured Demos: +1. ✅ Docker Hooks System - Real API calls with custom hooks +2. 
✅ Enhanced LLM Integration - Working LLM configurations
+3. ✅ HTTPS Preservation - Live crawling with HTTPS maintenance
+
+Requirements:
+- crawl4ai v0.7.5 installed
+- Docker running with crawl4ai image (optional for Docker demos)
+- Valid API keys for LLM demos (optional)
+"""
+
+import asyncio
+import requests
+import time
+import sys
+
+from crawl4ai import (AsyncWebCrawler, CrawlerRunConfig, BrowserConfig,
+                      CacheMode, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy)
+
+
+def print_section(title: str, description: str = ""):
+    """Print title (and optional description) between two 60-character '=' rules."""
+    print(f"\n{'=' * 60}")
+    print(f"{title}")
+    if description:
+        print(f"{description}")
+    print(f"{'=' * 60}\n")
+
+
+async def demo_1_docker_hooks_system():
+    """Demo 1: Docker Hooks System - POST a crawl with 3 custom hooks to the local Docker API."""
+    print_section(
+        "Demo 1: Docker Hooks System",
+        "Testing real Docker hooks with live API calls"
+    )
+
+    # Probe the Docker API on port 11235; any request failure means "not running"
+    def check_docker_service():
+        try:
+            response = requests.get("http://localhost:11235/", timeout=3)
+            return response.status_code == 200
+        except requests.RequestException:
+            return False
+
+    print("Checking Docker service...")
+    docker_running = check_docker_service()
+
+    if not docker_running:
+        print("⚠️ Docker service not running on localhost:11235")
+        print("To test Docker hooks:")
+        print("1. Run: docker run -p 11235:11235 unclecode/crawl4ai:latest")
+        print("2. Wait for service to start")
+        print("3. Re-run this demo\n")
+        return
+
+    print("✓ Docker service detected!")
+
+    # Hook sources are sent as strings, keyed by hook point, in the request payload
+    hooks_config = {
+        "on_page_context_created": """
+async def hook(page, context, **kwargs):
+    print("Hook: Setting up page context")
+    # Block images to speed up crawling
+    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
+    print("Hook: Images blocked")
+    return page
+""",
+
+        "before_retrieve_html": """
+async def hook(page, context, **kwargs):
+    print("Hook: Before retrieving HTML")
+    # Scroll to bottom to load lazy content
+    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+    await page.wait_for_timeout(1000)
+    print("Hook: Scrolled to bottom")
+    return page
+""",
+
+        "before_goto": """
+async def hook(page, context, url, **kwargs):
+    print(f"Hook: About to navigate to {url}")
+    # Add custom headers
+    await page.set_extra_http_headers({
+        'X-Test-Header': 'crawl4ai-hooks-test'
+    })
+    return page
+"""
+    }
+
+    # Test with a reliable URL
+    test_url = "https://httpbin.org/html"
+
+    payload = {
+        "urls": [test_url],
+        "hooks": {
+            "code": hooks_config,
+            "timeout": 30
+        }
+    }
+
+    print(f"🎯 Testing URL: {test_url}")
+    print("🔧 Configured 3 hooks: on_page_context_created, before_retrieve_html, before_goto\n")
+
+    # Make the request
+    print("🔄 Executing hooks...")
+
+    try:
+        start_time = time.time()
+        response = requests.post(
+            "http://localhost:11235/crawl",
+            json=payload,
+            timeout=60
+        )
+        execution_time = time.time() - start_time
+
+        if response.status_code == 200:
+            result = response.json()
+
+            print(f"🎉 Success! Execution time: {execution_time:.2f}s\n")
+
+            # Display results
+            success = result.get('success', False)
+            print(f"✅ Crawl Status: {'Success' if success else 'Failed'}")
+
+            if success:
+                markdown_content = result.get('markdown', '')
+                print(f"📄 Content Length: {len(markdown_content)} characters")
+
+                # Show content preview
+                if markdown_content:
+                    preview = markdown_content[:300] + "..." if len(markdown_content) > 300 else markdown_content
+                    print("\n--- Content Preview ---")
+                    print(preview)
+                    print("--- End Preview ---\n")
+
+                # None of the hooks above injects a marker, so report the HTML size instead
+                raw_html = result.get('html', '')
+                if raw_html:
+                    print(f"✓ HTML returned: {len(raw_html)} characters")
+
+                # Summarize what each configured hook does
+                print("\nHooks configured for this crawl:")
+                print("🖼️ on_page_context_created: Image requests blocked")
+                print("🔗 before_goto: Custom X-Test-Header added")
+                print("📜 before_retrieve_html: Page scrolled to load lazy content")
+
+        else:
+            print(f"❌ Request failed: {response.status_code}")
+            try:
+                error_data = response.json()
+                print(f"Error: {error_data}")
+            except ValueError:
+                print(f"Raw response: {response.text[:500]}")
+
+    except requests.exceptions.Timeout:
+        print("⏰ Request timed out after 60 seconds")
+    except Exception as e:
+        print(f"❌ Error: {str(e)}")
+
+
+async def demo_2_enhanced_llm_integration():
+    """Demo 2: Enhanced LLM Integration - call the Docker /md endpoint with provider + temperature."""
+    print_section(
+        "Demo 2: Enhanced LLM Integration",
+        "Testing custom LLM providers and configurations"
+    )
+
+    print("🤖 Testing Enhanced LLM Integration Features")
+
+    provider = "gemini/gemini-2.5-flash-lite"
+    payload = {
+        "url": "https://example.com",
+        "f": "llm",
+        "q": "Summarize this page in one sentence.",
+        "provider": provider,  # Explicitly set provider
+        "temperature": 0.7
+    }
+    try:
+        response = requests.post(
+            "http://localhost:11235/md",
+            json=payload,
+            timeout=60
+        )
+        if response.status_code == 200:
+            result = response.json()
+            print(f"✓ Request successful with provider: {provider}")
+            print(f" - Response keys: {list(result.keys())}")
+            print(f" - Content length: {len(result.get('markdown', ''))} characters")
+            print(f" - Note: Actual LLM call may fail without valid API key")
+        else:
+            print(f"❌ Request failed: {response.status_code}")
+            print(f" - Response: {response.text[:500]}")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+
+
+async def demo_3_https_preservation():
+    """Demo 3: HTTPS Preservation - stream a small BFS deep crawl and list each page's internal links."""
+    print_section(
+        "Demo 3: HTTPS Preservation",
+        "Testing HTTPS preservation for internal links"
+    )
+
+    print("🔒 Testing HTTPS Preservation Feature")
+
+    # Test with HTTPS preservation enabled
+    print("\nTest 1: HTTPS Preservation ENABLED")
+
+    url_filter = URLPatternFilter(
+        patterns=[r"^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
+    )
+    config = CrawlerRunConfig(
+        exclude_external_links=True,
+        stream=True,
+        verbose=False,
+        preserve_https_for_internal_links=True,
+        deep_crawl_strategy=BFSDeepCrawlStrategy(
+            max_depth=2,
+            max_pages=5,
+            filter_chain=FilterChain([url_filter])
+        )
+    )
+
+    test_url = "https://quotes.toscrape.com"
+    print(f"🎯 Testing URL: {test_url}")
+
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun(url=test_url, config=config):
+            print("✓ HTTPS Preservation Test Completed")
+            internal_links = [i['href'] for i in result.links['internal']]
+            for link in internal_links:
+                print(f" → {link}")
+
+
+async def main():
+    """Run every demo in order, pausing for Enter between them; a failing demo does not stop the rest."""
+    print("\n" + "=" * 60)
+    print("🚀 Crawl4AI v0.7.5 Working Demo")
+    print("=" * 60)
+
+    # Check system requirements
+    print("🔍 System Requirements Check:")
+    print(f" - Python version: {sys.version.split()[0]} {'✓' if sys.version_info >= (3, 10) else '❌ (3.10+ required)'}")
+
+    try:
+        import requests
+        print(f" - Requests library: ✓")
+    except ImportError:
+        print(f" - Requests library: ❌")
+
+    print()
+
+    demos = [
+        ("Docker Hooks System", demo_1_docker_hooks_system),
+        ("Enhanced LLM Integration", demo_2_enhanced_llm_integration),
+        ("HTTPS Preservation", demo_3_https_preservation),
+    ]
+
+    for i, (name, demo_func) in enumerate(demos, 1):
+        try:
+            print(f"\n📍 Starting Demo {i}/{len(demos)}: {name}")
+            await demo_func()
+
+            if i < len(demos):
+                print(f"\n✨ Demo {i} complete! Press Enter for next demo...")
+                input()
+
+        except KeyboardInterrupt:
+            print(f"\n⏹️ Demo interrupted by user")
+            break
+        except Exception as e:
+            print(f"❌ Demo {i} error: {str(e)}")
+            print("Continuing to next demo...")
+            continue
+
+    print("\n" + "=" * 60)
+    print("🎉 Demo Complete!")
+    print("=" * 60)
+    print("You've experienced the power of Crawl4AI v0.7.5!")
+    print("")
+    print("Key Features Demonstrated:")
+    print("🔧 Docker Hooks - Custom pipeline modifications")
+    print("🤖 Enhanced LLM - Better AI integration")
+    print("🔒 HTTPS Preservation - Secure link handling")
+    print("")
+    print("Ready to build something amazing? 🚀")
+    print("")
+    print("📖 Docs: https://docs.crawl4ai.com/")
+    print("🐙 GitHub: https://github.com/unclecode/crawl4ai")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    print("🚀 Crawl4AI v0.7.5 Live Demo Starting...")
+    print("Press Ctrl+C anytime to exit\n")
+
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        print("\n👋 Demo stopped by user. Thanks for trying Crawl4AI v0.7.5!")
+    except Exception as e:
+        print(f"\n❌ Demo error: {str(e)}")
+        print("Make sure you have the required dependencies installed.")