From 8fc1747225a887efe9e86130b5e9aebb058f24fa Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 13 Oct 2025 13:59:34 +0800 Subject: [PATCH] docs: Add demonstration files for v0.7.5 release, showcasing the new Docker Hooks System and all other features. --- .../v0.7.5_docker_hooks_demo.py | 655 +++++++ .../v0.7.5_video_walkthrough.ipynb | 1516 +++++++++++++++++ 2 files changed, 2171 insertions(+) create mode 100644 docs/releases_review/v0.7.5_docker_hooks_demo.py create mode 100644 docs/releases_review/v0.7.5_video_walkthrough.ipynb diff --git a/docs/releases_review/v0.7.5_docker_hooks_demo.py b/docs/releases_review/v0.7.5_docker_hooks_demo.py new file mode 100644 index 00000000..9b4be0c2 --- /dev/null +++ b/docs/releases_review/v0.7.5_docker_hooks_demo.py @@ -0,0 +1,655 @@ +#!/usr/bin/env python3 +""" +πŸš€ Crawl4AI v0.7.5 - Docker Hooks System Complete Demonstration +================================================================ + +This file demonstrates the NEW Docker Hooks System introduced in v0.7.5. + +The Docker Hooks System is a completely NEW feature that provides pipeline +customization through user-provided Python functions. It offers three approaches: + +1. String-based hooks for REST API +2. hooks_to_string() utility to convert functions +3. Docker Client with automatic conversion (most convenient) + +All three approaches are part of this NEW v0.7.5 feature! + +Perfect for video recording and demonstration purposes. 
+ +Requirements: +- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest +- crawl4ai v0.7.5 installed: pip install crawl4ai==0.7.5 +""" + +import asyncio +import requests +import json +import time +from typing import Dict, Any + +# Import Crawl4AI components +from crawl4ai import hooks_to_string +from crawl4ai.docker_client import Crawl4aiDockerClient + +# Configuration +# DOCKER_URL = "http://localhost:11235" +DOCKER_URL = "http://localhost:11234" +TEST_URLS = [ + # "https://httpbin.org/html", + "https://www.kidocode.com", + "https://quotes.toscrape.com", +] + + +def print_section(title: str, description: str = ""): + """Print a formatted section header""" + print("\n" + "=" * 70) + print(f" {title}") + if description: + print(f" {description}") + print("=" * 70 + "\n") + + +def check_docker_service() -> bool: + """Check if Docker service is running""" + try: + response = requests.get(f"{DOCKER_URL}/health", timeout=3) + return response.status_code == 200 + except: + return False + + +# ============================================================================ +# REUSABLE HOOK LIBRARY (NEW in v0.7.5) +# ============================================================================ + +async def performance_optimization_hook(page, context, **kwargs): + """ + Performance Hook: Block unnecessary resources to speed up crawling + """ + print(" [Hook] πŸš€ Optimizing performance - blocking images and ads...") + + # Block images + await context.route( + "**/*.{png,jpg,jpeg,gif,webp,svg,ico}", + lambda route: route.abort() + ) + + # Block ads and analytics + await context.route("**/analytics/*", lambda route: route.abort()) + await context.route("**/ads/*", lambda route: route.abort()) + await context.route("**/google-analytics.com/*", lambda route: route.abort()) + + print(" [Hook] βœ“ Performance optimization applied") + return page + + +async def viewport_setup_hook(page, context, **kwargs): + """ + Viewport Hook: Set consistent viewport size 
for rendering + """ + print(" [Hook] πŸ–₯️ Setting viewport to 1920x1080...") + await page.set_viewport_size({"width": 1920, "height": 1080}) + print(" [Hook] βœ“ Viewport configured") + return page + + +async def authentication_headers_hook(page, context, url, **kwargs): + """ + Headers Hook: Add custom authentication and tracking headers + """ + print(f" [Hook] πŸ” Adding custom headers for {url[:50]}...") + + await page.set_extra_http_headers({ + 'X-Crawl4AI-Version': '0.7.5', + 'X-Custom-Hook': 'function-based-demo', + 'Accept-Language': 'en-US,en;q=0.9', + 'User-Agent': 'Crawl4AI/0.7.5 (Educational Demo)' + }) + + print(" [Hook] βœ“ Custom headers added") + return page + + +async def lazy_loading_handler_hook(page, context, **kwargs): + """ + Content Hook: Handle lazy-loaded content by scrolling + """ + print(" [Hook] πŸ“œ Scrolling to load lazy content...") + + # Scroll to bottom + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + + # Scroll to middle + await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)") + await page.wait_for_timeout(500) + + # Scroll back to top + await page.evaluate("window.scrollTo(0, 0)") + await page.wait_for_timeout(500) + + print(" [Hook] βœ“ Lazy content loaded") + return page + + +async def page_analytics_hook(page, context, **kwargs): + """ + Analytics Hook: Log page metrics before extraction + """ + print(" [Hook] πŸ“Š Collecting page analytics...") + + metrics = await page.evaluate(''' + () => ({ + title: document.title, + images: document.images.length, + links: document.links.length, + scripts: document.scripts.length, + headings: document.querySelectorAll('h1, h2, h3').length, + paragraphs: document.querySelectorAll('p').length + }) + ''') + + print(f" [Hook] πŸ“ˆ Page: {metrics['title'][:50]}...") + print(f" Links: {metrics['links']}, Images: {metrics['images']}, " + f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}") + + 
return page + + +# ============================================================================ +# DEMO 1: String-Based Hooks (NEW Docker Hooks System) +# ============================================================================ + +def demo_1_string_based_hooks(): + """ + Demonstrate string-based hooks with REST API (part of NEW Docker Hooks System) + """ + print_section( + "DEMO 1: String-Based Hooks (REST API)", + "Part of the NEW Docker Hooks System - hooks as strings" + ) + + # Define hooks as strings + hooks_config = { + "on_page_context_created": """ +async def hook(page, context, **kwargs): + print(" [String Hook] Setting up page context...") + # Block images for performance + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page +""", + + "before_goto": """ +async def hook(page, context, url, **kwargs): + print(f" [String Hook] Navigating to {url[:50]}...") + await page.set_extra_http_headers({ + 'X-Crawl4AI': 'string-based-hooks', + 'X-Demo': 'v0.7.5' + }) + return page +""", + + "before_retrieve_html": """ +async def hook(page, context, **kwargs): + print(" [String Hook] Scrolling page...") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + return page +""" + } + + # Prepare request payload + payload = { + "urls": [TEST_URLS[0]], + "hooks": { + "code": hooks_config, + "timeout": 30 + }, + "crawler_config": { + "cache_mode": "bypass" + } + } + + print(f"🎯 Target URL: {TEST_URLS[0]}") + print(f"πŸ”§ Configured {len(hooks_config)} string-based hooks") + print(f"πŸ“‘ Sending request to Docker API...\n") + + try: + start_time = time.time() + response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) + execution_time = time.time() - start_time + + if response.status_code == 200: + result = response.json() + + print(f"\nβœ… Request successful! 
(took {execution_time:.2f}s)") + + # Display results + if result.get('results') and result['results'][0].get('success'): + crawl_result = result['results'][0] + html_length = len(crawl_result.get('html', '')) + markdown_length = len(crawl_result.get('markdown', '')) + + print(f"\nπŸ“Š Results:") + print(f" β€’ HTML length: {html_length:,} characters") + print(f" β€’ Markdown length: {markdown_length:,} characters") + print(f" β€’ URL: {crawl_result.get('url')}") + + # Check hooks execution + if 'hooks' in result: + hooks_info = result['hooks'] + print(f"\n🎣 Hooks Execution:") + print(f" β€’ Status: {hooks_info['status']['status']}") + print(f" β€’ Attached hooks: {len(hooks_info['status']['attached_hooks'])}") + + if 'summary' in hooks_info: + summary = hooks_info['summary'] + print(f" β€’ Total executions: {summary['total_executions']}") + print(f" β€’ Successful: {summary['successful']}") + print(f" β€’ Success rate: {summary['success_rate']:.1f}%") + else: + print(f"⚠️ Crawl completed but no results") + + else: + print(f"❌ Request failed with status {response.status_code}") + print(f" Error: {response.text[:200]}") + + except requests.exceptions.Timeout: + print("⏰ Request timed out after 60 seconds") + except Exception as e: + print(f"❌ Error: {str(e)}") + + print("\n" + "─" * 70) + print("βœ“ String-based hooks demo complete\n") + + +# ============================================================================ +# DEMO 2: Function-Based Hooks with hooks_to_string() Utility +# ============================================================================ + +def demo_2_hooks_to_string_utility(): + """ + Demonstrate the new hooks_to_string() utility for converting functions + """ + print_section( + "DEMO 2: hooks_to_string() Utility (NEW! 
✨)", + "Convert Python functions to strings for REST API" + ) + + print("πŸ“¦ Creating hook functions...") + print(" β€’ performance_optimization_hook") + print(" β€’ viewport_setup_hook") + print(" β€’ authentication_headers_hook") + print(" β€’ lazy_loading_handler_hook") + + # Convert function objects to strings using the NEW utility + print("\nπŸ”„ Converting functions to strings with hooks_to_string()...") + + hooks_dict = { + "on_page_context_created": performance_optimization_hook, + "before_goto": authentication_headers_hook, + "before_retrieve_html": lazy_loading_handler_hook, + } + + hooks_as_strings = hooks_to_string(hooks_dict) + + print(f"βœ… Successfully converted {len(hooks_as_strings)} functions to strings") + + # Show a preview + print("\nπŸ“ Sample converted hook (first 250 characters):") + print("─" * 70) + sample_hook = list(hooks_as_strings.values())[0] + print(sample_hook[:250] + "...") + print("─" * 70) + + # Use the converted hooks with REST API + print("\nπŸ“‘ Using converted hooks with REST API...") + + payload = { + "urls": [TEST_URLS[0]], + "hooks": { + "code": hooks_as_strings, + "timeout": 30 + } + } + + try: + start_time = time.time() + response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) + execution_time = time.time() - start_time + + if response.status_code == 200: + result = response.json() + print(f"\nβœ… Request successful! 
(took {execution_time:.2f}s)") + + if result.get('results') and result['results'][0].get('success'): + crawl_result = result['results'][0] + print(f" β€’ HTML length: {len(crawl_result.get('html', '')):,} characters") + print(f" β€’ Hooks executed successfully!") + else: + print(f"❌ Request failed: {response.status_code}") + + except Exception as e: + print(f"❌ Error: {str(e)}") + + print("\nπŸ’‘ Benefits of hooks_to_string():") + print(" βœ“ Write hooks as regular Python functions") + print(" βœ“ Full IDE support (autocomplete, syntax highlighting)") + print(" βœ“ Type checking and linting") + print(" βœ“ Easy to test and debug") + print(" βœ“ Reusable across projects") + print(" βœ“ Works with any REST API client") + + print("\n" + "─" * 70) + print("βœ“ hooks_to_string() utility demo complete\n") + + +# ============================================================================ +# DEMO 3: Docker Client with Automatic Conversion (RECOMMENDED! 🌟) +# ============================================================================ + +async def demo_3_docker_client_auto_conversion(): + """ + Demonstrate Docker Client with automatic hook conversion (RECOMMENDED) + """ + print_section( + "DEMO 3: Docker Client with Auto-Conversion (RECOMMENDED! 🌟)", + "Pass function objects directly - conversion happens automatically!" + ) + + print("🐳 Initializing Crawl4AI Docker Client...") + client = Crawl4aiDockerClient(base_url=DOCKER_URL) + + print("βœ… Client ready!\n") + + # Use our reusable hook library - just pass the function objects! 
+ print("πŸ“š Using reusable hook library:") + print(" β€’ performance_optimization_hook") + print(" β€’ viewport_setup_hook") + print(" β€’ authentication_headers_hook") + print(" β€’ lazy_loading_handler_hook") + print(" β€’ page_analytics_hook") + + print("\n🎯 Target URL: " + TEST_URLS[1]) + print("πŸš€ Starting crawl with automatic hook conversion...\n") + + try: + start_time = time.time() + + # Pass function objects directly - NO manual conversion needed! ✨ + results = await client.crawl( + urls=[TEST_URLS[0]], + hooks={ + "on_page_context_created": performance_optimization_hook, + "before_goto": authentication_headers_hook, + "before_retrieve_html": lazy_loading_handler_hook, + "before_return_html": page_analytics_hook, + }, + hooks_timeout=30 + ) + + execution_time = time.time() - start_time + + print(f"\nβœ… Crawl completed! (took {execution_time:.2f}s)\n") + + # Display results + if results and results.success: + result = results + print(f"πŸ“Š Results:") + print(f" β€’ URL: {result.url}") + print(f" β€’ Success: {result.success}") + print(f" β€’ HTML length: {len(result.html):,} characters") + print(f" β€’ Markdown length: {len(result.markdown):,} characters") + + # Show metadata + if result.metadata: + print(f"\nπŸ“‹ Metadata:") + print(f" β€’ Title: {result.metadata.get('title', 'N/A')}") + print(f" β€’ Description: {result.metadata.get('description', 'N/A')}") + + # Show links + if result.links: + internal_count = len(result.links.get('internal', [])) + external_count = len(result.links.get('external', [])) + print(f"\nπŸ”— Links Found:") + print(f" β€’ Internal: {internal_count}") + print(f" β€’ External: {external_count}") + else: + print(f"⚠️ Crawl completed but no successful results") + if results: + print(f" Error: {results.error_message}") + + except Exception as e: + print(f"❌ Error: {str(e)}") + import traceback + traceback.print_exc() + + print("\n🌟 Why Docker Client is RECOMMENDED:") + print(" βœ“ Automatic function-to-string conversion") + 
print(" βœ“ No manual hooks_to_string() calls needed") + print(" βœ“ Cleaner, more Pythonic code") + print(" βœ“ Full type hints and IDE support") + print(" βœ“ Built-in error handling") + print(" βœ“ Async/await support") + + print("\n" + "─" * 70) + print("βœ“ Docker Client auto-conversion demo complete\n") + + +# ============================================================================ +# DEMO 4: Advanced Use Case - Complete Hook Pipeline +# ============================================================================ + +async def demo_4_complete_hook_pipeline(): + """ + Demonstrate a complete hook pipeline using all 8 hook points + """ + print_section( + "DEMO 4: Complete Hook Pipeline", + "Using all 8 available hook points for comprehensive control" + ) + + # Define all 8 hooks + async def on_browser_created_hook(browser, **kwargs): + """Hook 1: Called after browser is created""" + print(" [Pipeline] 1/8 Browser created") + return browser + + async def on_page_context_created_hook(page, context, **kwargs): + """Hook 2: Called after page context is created""" + print(" [Pipeline] 2/8 Page context created - setting up...") + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + + async def on_user_agent_updated_hook(page, context, user_agent, **kwargs): + """Hook 3: Called when user agent is updated""" + print(f" [Pipeline] 3/8 User agent updated: {user_agent[:50]}...") + return page + + async def before_goto_hook(page, context, url, **kwargs): + """Hook 4: Called before navigating to URL""" + print(f" [Pipeline] 4/8 Before navigation to: {url[:60]}...") + return page + + async def after_goto_hook(page, context, url, response, **kwargs): + """Hook 5: Called after navigation completes""" + print(f" [Pipeline] 5/8 After navigation - Status: {response.status if response else 'N/A'}") + await page.wait_for_timeout(1000) + return page + + async def on_execution_started_hook(page, context, **kwargs): + """Hook 6: Called when JavaScript 
execution starts""" + print(" [Pipeline] 6/8 JavaScript execution started") + return page + + async def before_retrieve_html_hook(page, context, **kwargs): + """Hook 7: Called before retrieving HTML""" + print(" [Pipeline] 7/8 Before HTML retrieval - scrolling...") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + return page + + async def before_return_html_hook(page, context, html, **kwargs): + """Hook 8: Called before returning HTML""" + print(f" [Pipeline] 8/8 Before return - HTML length: {len(html):,} chars") + return page + + print("🎯 Target URL: " + TEST_URLS[0]) + print("πŸ”§ Configured ALL 8 hook points for complete pipeline control\n") + + client = Crawl4aiDockerClient(base_url=DOCKER_URL) + + try: + print("πŸš€ Starting complete pipeline crawl...\n") + start_time = time.time() + + results = await client.crawl( + urls=[TEST_URLS[0]], + hooks={ + "on_browser_created": on_browser_created_hook, + "on_page_context_created": on_page_context_created_hook, + "on_user_agent_updated": on_user_agent_updated_hook, + "before_goto": before_goto_hook, + "after_goto": after_goto_hook, + "on_execution_started": on_execution_started_hook, + "before_retrieve_html": before_retrieve_html_hook, + "before_return_html": before_return_html_hook, + }, + hooks_timeout=45 + ) + + execution_time = time.time() - start_time + + if results and results.success: + print(f"\nβœ… Complete pipeline executed successfully! (took {execution_time:.2f}s)") + print(f" β€’ All 8 hooks executed in sequence") + print(f" β€’ HTML length: {len(results.html):,} characters") + else: + print(f"⚠️ Pipeline completed with warnings") + + except Exception as e: + print(f"❌ Error: {str(e)}") + + print("\nπŸ“š Available Hook Points:") + print(" 1. on_browser_created - Browser initialization") + print(" 2. on_page_context_created - Page context setup") + print(" 3. on_user_agent_updated - User agent configuration") + print(" 4. before_goto - Pre-navigation setup") + print(" 5. 
after_goto - Post-navigation processing") + print(" 6. on_execution_started - JavaScript execution start") + print(" 7. before_retrieve_html - Pre-extraction processing") + print(" 8. before_return_html - Final HTML processing") + + print("\n" + "─" * 70) + print("βœ“ Complete hook pipeline demo complete\n") + + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +async def main(): + """ + Run all demonstrations + """ + print("\n" + "=" * 70) + print(" πŸš€ Crawl4AI v0.7.5 - Docker Hooks Complete Demonstration") + print("=" * 70) + + # Check Docker service + print("\nπŸ” Checking Docker service status...") + if not check_docker_service(): + print("❌ Docker service is not running!") + print("\nπŸ“‹ To start the Docker service:") + print(" docker run -p 11235:11235 unclecode/crawl4ai:latest") + print("\nPlease start the service and run this demo again.") + return + + print("βœ… Docker service is running!\n") + + # Run all demos + demos = [ + ("String-Based Hooks (REST API)", demo_1_string_based_hooks, False), + ("hooks_to_string() Utility", demo_2_hooks_to_string_utility, False), + ("Docker Client Auto-Conversion", demo_3_docker_client_auto_conversion, True), + ("Complete Hook Pipeline", demo_4_complete_hook_pipeline, True), + ] + + for i, (name, demo_func, is_async) in enumerate(demos, 1): + print(f"\n{'πŸ”·' * 35}") + print(f"Starting Demo {i}/{len(demos)}: {name}") + print(f"{'πŸ”·' * 35}\n") + + try: + if is_async: + await demo_func() + else: + demo_func() + + print(f"βœ… Demo {i} completed successfully!") + + # Pause between demos (except the last one) + if i < len(demos): + print("\n⏸️ Press Enter to continue to next demo...") + input() + + except KeyboardInterrupt: + print(f"\n⏹️ Demo interrupted by user") + break + except Exception as e: + print(f"\n❌ Demo {i} failed: {str(e)}") + import traceback + traceback.print_exc() + 
print("\nContinuing to next demo...\n") + continue + + # Final summary + # print("\n" + "=" * 70) + # print(" πŸŽ‰ All Demonstrations Complete!") + # print("=" * 70) + + # print("\nπŸ“Š Summary of v0.7.5 Docker Hooks System:") + # print("\nπŸ†• COMPLETELY NEW FEATURE in v0.7.5:") + # print(" The Docker Hooks System lets you customize the crawling pipeline") + # print(" with user-provided Python functions at 8 strategic points.") + + # print("\n✨ Three Ways to Use Docker Hooks (All NEW!):") + # print(" 1. String-based - Write hooks as strings for REST API") + # print(" 2. hooks_to_string() - Convert Python functions to strings") + # print(" 3. Docker Client - Automatic conversion (RECOMMENDED)") + + # print("\nπŸ’‘ Key Benefits:") + # print(" βœ“ Full IDE support (autocomplete, syntax highlighting)") + # print(" βœ“ Type checking and linting") + # print(" βœ“ Easy to test and debug") + # print(" βœ“ Reusable across projects") + # print(" βœ“ Complete pipeline control") + + # print("\n🎯 8 Hook Points Available:") + # print(" β€’ on_browser_created, on_page_context_created") + # print(" β€’ on_user_agent_updated, before_goto, after_goto") + # print(" β€’ on_execution_started, before_retrieve_html, before_return_html") + + # print("\nπŸ“š Resources:") + # print(" β€’ Docs: https://docs.crawl4ai.com") + # print(" β€’ GitHub: https://github.com/unclecode/crawl4ai") + # print(" β€’ Discord: https://discord.gg/jP8KfhDhyN") + + # print("\n" + "=" * 70) + # print(" Happy Crawling with v0.7.5! πŸ•·οΈ") + # print("=" * 70 + "\n") + + +if __name__ == "__main__": + print("\n🎬 Starting Crawl4AI v0.7.5 Docker Hooks Demonstration...") + print("Press Ctrl+C anytime to exit\n") + + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\n\nπŸ‘‹ Demo stopped by user. 
Thanks for exploring Crawl4AI v0.7.5!") + except Exception as e: + print(f"\n\n❌ Demo error: {str(e)}") + import traceback + traceback.print_exc() diff --git a/docs/releases_review/v0.7.5_video_walkthrough.ipynb b/docs/releases_review/v0.7.5_video_walkthrough.ipynb new file mode 100644 index 00000000..a57de4c9 --- /dev/null +++ b/docs/releases_review/v0.7.5_video_walkthrough.ipynb @@ -0,0 +1,1516 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# πŸš€ Crawl4AI v0.7.5 - Complete Feature Walkthrough\n", + "\n", + "Welcome to Crawl4AI v0.7.5! This notebook demonstrates all the new features introduced in this release.\n", + "\n", + "## πŸ“‹ What's New in v0.7.5\n", + "\n", + "1. **πŸ”§ Docker Hooks System** - NEW! Complete pipeline customization with user-provided Python functions\n", + "2. **πŸ€– Enhanced LLM Integration** - Custom providers with temperature control\n", + "3. **πŸ”’ HTTPS Preservation** - Secure internal link handling\n", + "4. **πŸ› οΈ Multiple Bug Fixes** - Community-reported issues resolved\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## πŸ“¦ Setup and Installation\n", + "\n", + "First, let's make sure we have the latest version installed:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ… Crawl4AI v0.7.5 ready!\n" + ] + } + ], + "source": [ + "# # Install or upgrade to v0.7.5\n", + "# !pip install -U crawl4ai==0.7.5 --quiet\n", + "\n", + "# Import required modules\n", + "import asyncio\n", + "import nest_asyncio\n", + "nest_asyncio.apply() # For Jupyter compatibility\n", + "\n", + "from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode\n", + "from crawl4ai import FilterChain, URLPatternFilter, BFSDeepCrawlStrategy\n", + "from crawl4ai import hooks_to_string\n", + "\n", + "print(\"βœ… Crawl4AI v0.7.5 ready!\")" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## πŸ”’ Feature 1: HTTPS Preservation for Internal Links\n", + "\n", + "### Problem\n", + "When crawling HTTPS sites, internal links sometimes get downgraded to HTTP, breaking authentication and causing security warnings.\n", + "\n", + "### Solution \n", + "The new `preserve_https_for_internal_links=True` parameter maintains HTTPS protocol for all internal links." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "πŸ”’ Testing HTTPS Preservation\n", + "\n", + "============================================================\n" + ] + }, + { + "data": { + "text/html": [ + "
[INIT].... β†’ Crawl4AI 0.7.5 \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36m[\u001b[0m\u001b[36mINIT\u001b[0m\u001b[1;36m]\u001b[0m\u001b[36m...\u001b[0m\u001b[36m. β†’ Crawl4AI \u001b[0m\u001b[1;36m0.7\u001b[0m\u001b[36m.\u001b[0m\u001b[1;36m5\u001b[0m\u001b[36m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[FETCH]... ↓ https://quotes.toscrape.com                                                                          |\n",
+       "βœ“ | ⏱: 1.98s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mFETCH\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m...\u001b[0m\u001b[32m ↓ \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m1.\u001b[0m\u001b[32m98s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[SCRAPE].. β—† https://quotes.toscrape.com                                                                          |\n",
+       "βœ“ | ⏱: 0.01s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mSCRAPE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m.. β—† \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m01s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[COMPLETE] ● https://quotes.toscrape.com                                                                          |\n",
+       "βœ“ | ⏱: 2.00s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mCOMPLETE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m ● \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m2.\u001b[0m\u001b[32m00s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[FETCH]... ↓ https://quotes.toscrape.com                                                                          |\n",
+       "βœ“ | ⏱: 0.72s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mFETCH\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m...\u001b[0m\u001b[32m ↓ \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m72s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[SCRAPE].. β—† https://quotes.toscrape.com                                                                          |\n",
+       "βœ“ | ⏱: 0.01s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mSCRAPE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m.. β—† \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m01s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[COMPLETE] ● https://quotes.toscrape.com                                                                          |\n",
+       "βœ“ | ⏱: 0.73s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mCOMPLETE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m ● \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m73s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[FETCH]... ↓ https://quotes.toscrape.com/login                                                                    |\n",
+       "βœ“ | ⏱: 0.83s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mFETCH\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m...\u001b[0m\u001b[32m ↓ \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com/login\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m83s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[SCRAPE].. β—† https://quotes.toscrape.com/login                                                                    |\n",
+       "βœ“ | ⏱: 0.00s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mSCRAPE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m.. β—† \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com/login\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m00s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[COMPLETE] ● https://quotes.toscrape.com/login                                                                    |\n",
+       "βœ“ | ⏱: 0.83s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mCOMPLETE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m ● \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com/login\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m83s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[FETCH]... ↓ https://quotes.toscrape.com/tag/change/page/1                                                        |\n",
+       "βœ“ | ⏱: 1.11s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mFETCH\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m...\u001b[0m\u001b[32m ↓ \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com/tag/change/page/1\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m1.\u001b[0m\u001b[32m11s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[SCRAPE].. β—† https://quotes.toscrape.com/tag/change/page/1                                                        |\n",
+       "βœ“ | ⏱: 0.00s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mSCRAPE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m.. β—† \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com/tag/change/page/1\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m00s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[COMPLETE] ● https://quotes.toscrape.com/tag/change/page/1                                                        |\n",
+       "βœ“ | ⏱: 1.12s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mCOMPLETE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m ● \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com/tag/change/page/1\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m1.\u001b[0m\u001b[32m12s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[FETCH]... ↓ https://quotes.toscrape.com/author/Albert-Einstein                                                   |\n",
+       "✓ | ⏱: 1.32s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mFETCH\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m...\u001b[0m\u001b[32m ↓ \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com/author/Albert-Einstein\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m1.\u001b[0m\u001b[32m32s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[SCRAPE].. β—† https://quotes.toscrape.com/author/Albert-Einstein                                                   |\n",
+       "✓ | ⏱: 0.00s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mSCRAPE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m.. β—† \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com/author/Albert-Einstein\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m00s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[COMPLETE] ● https://quotes.toscrape.com/author/Albert-Einstein                                                   |\n",
+       "✓ | ⏱: 1.33s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mCOMPLETE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m ● \u001b[0m\u001b[4;32mhttps://quotes.toscrape.com/author/Albert-Einstein\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m1.\u001b[0m\u001b[32m33s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "πŸ“Š Results:\n", + " Pages crawled: 5\n", + " Total internal links (from first page): 47\n", + " HTTPS links: 47 βœ…\n", + " HTTP links: 0 \n", + " HTTPS preservation rate: 100.0%\n", + "\n", + "πŸ”— Sample HTTPS-preserved links:\n", + " β†’ https://quotes.toscrape.com/\n", + " β†’ https://quotes.toscrape.com/login\n", + " β†’ https://quotes.toscrape.com/author/Albert-Einstein\n", + " β†’ https://quotes.toscrape.com/tag/change/page/1\n", + " β†’ https://quotes.toscrape.com/tag/deep-thoughts/page/1\n", + "\n", + "============================================================\n", + "βœ… HTTPS Preservation Demo Complete!\n", + "\n" + ] + } + ], + "source": [ + "async def demo_https_preservation():\n", + " \"\"\"\n", + " Demonstrate HTTPS preservation with deep crawling\n", + " \"\"\"\n", + " print(\"πŸ”’ Testing HTTPS Preservation\\n\")\n", + " print(\"=\" * 60)\n", + " \n", + " # Setup URL filter for quotes.toscrape.com\n", + " url_filter = URLPatternFilter(\n", + " patterns=[r\"^(https:\\/\\/)?quotes\\.toscrape\\.com(\\/.*)?$\"]\n", + " )\n", + " \n", + " # Configure crawler with HTTPS preservation\n", + " config = CrawlerRunConfig(\n", + " exclude_external_links=True,\n", + " preserve_https_for_internal_links=True, # πŸ†• NEW in v0.7.5\n", + " cache_mode=CacheMode.BYPASS,\n", + " deep_crawl_strategy=BFSDeepCrawlStrategy(\n", + " max_depth=2,\n", + " max_pages=5,\n", + " filter_chain=FilterChain([url_filter])\n", + " )\n", + " )\n", + " \n", + " async with AsyncWebCrawler() as crawler:\n", + " # With 
deep_crawl_strategy, arun() returns a list of CrawlResult objects\n", + " results = await crawler.arun(\n", + " url=\"https://quotes.toscrape.com\",\n", + " config=config\n", + " )\n", + " \n", + " # Analyze the first result\n", + " if results and len(results) > 0:\n", + " first_result = results[0]\n", + " internal_links = [link['href'] for link in first_result.links['internal']]\n", + " \n", + " # Check HTTPS preservation\n", + " https_links = [link for link in internal_links if link.startswith('https://')]\n", + " http_links = [link for link in internal_links if link.startswith('http://') and not link.startswith('https://')]\n", + " \n", + " print(f\"\\nπŸ“Š Results:\")\n", + " print(f\" Pages crawled: {len(results)}\")\n", + " print(f\" Total internal links (from first page): {len(internal_links)}\")\n", + " print(f\" HTTPS links: {len(https_links)} βœ…\")\n", + " print(f\" HTTP links: {len(http_links)} {'⚠️' if http_links else ''}\")\n", + " if internal_links:\n", + " print(f\" HTTPS preservation rate: {len(https_links)/len(internal_links)*100:.1f}%\")\n", + " \n", + " print(f\"\\nπŸ”— Sample HTTPS-preserved links:\")\n", + " for link in https_links[:5]:\n", + " print(f\" β†’ {link}\")\n", + " else:\n", + " print(f\"\\n⚠️ No results returned\")\n", + " \n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"βœ… HTTPS Preservation Demo Complete!\\n\")\n", + "\n", + "# Run the demo\n", + "await demo_https_preservation()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## πŸ€– Feature 2: Enhanced LLM Integration\n", + "\n", + "### What's New\n", + "- Custom `temperature` parameter for creativity control\n", + "- `base_url` for custom API endpoints\n", + "- Better multi-provider support\n", + "\n", + "### Example with Custom Temperature" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "πŸ€– Testing Enhanced LLM 
Integration\n", + "\n", + "============================================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/k0/7502j87n0_q4f9g82c0w8ks80000gn/T/ipykernel_15029/173393508.py:47: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/\n", + " schema=Article.schema(),\n" + ] + }, + { + "data": { + "text/html": [ + "
[INIT].... β†’ Crawl4AI 0.7.5 \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36m[\u001b[0m\u001b[36mINIT\u001b[0m\u001b[1;36m]\u001b[0m\u001b[36m...\u001b[0m\u001b[36m. β†’ Crawl4AI \u001b[0m\u001b[1;36m0.7\u001b[0m\u001b[36m.\u001b[0m\u001b[1;36m5\u001b[0m\u001b[36m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[FETCH]... ↓ https://en.wikipedia.org/wiki/Artificial_intelligence                                                |\n",
+       "βœ“ | ⏱: 3.05s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mFETCH\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m...\u001b[0m\u001b[32m ↓ \u001b[0m\u001b[4;32mhttps://en.wikipedia.org/wiki/Artificial_intelligence\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m3.\u001b[0m\u001b[32m05s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[SCRAPE].. β—† https://en.wikipedia.org/wiki/Artificial_intelligence                                                |\n",
+       "✓ | ⏱: 0.63s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mSCRAPE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m.. β—† \u001b[0m\u001b[4;32mhttps://en.wikipedia.org/wiki/Artificial_intelligence\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m63s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[EXTRACT]. β–  https://en.wikipedia.org/wiki/Artificial_intelligence                                                |\n",
+       "✓ | ⏱: 20.74s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mEXTRACT\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m. β–  \u001b[0m\u001b[4;32mhttps://en.wikipedia.org/wiki/Artificial_intelligence\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m20.\u001b[0m\u001b[32m74s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[COMPLETE] ● https://en.wikipedia.org/wiki/Artificial_intelligence                                                |\n",
+       "✓ | ⏱: 24.42s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mCOMPLETE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m ● \u001b[0m\u001b[4;32mhttps://en.wikipedia.org/wiki/Artificial_intelligence\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m24.\u001b[0m\u001b[32m42s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "βœ… LLM Extraction Successful!\n", + "\n", + "πŸ“„ Extracted Content:\n", + "[\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think and learn like humans. AI can be applied in various fields and has numerous applications, including health, finance, and military.\",\n", + " \"main_topics\": [\n", + " \"Goals\",\n", + " \"Techniques\",\n", + " \"Applications\",\n", + " \"Ethics\",\n", + " \"History\",\n", + " \"Philosophy\",\n", + " \"Future\",\n", + " \"In fiction\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"The article discusses artificial intelligence (AI), its various techniques, applications, and advancements, particularly focusing on machine learning, deep learning, and neural networks. 
It highlights the evolution of AI technologies, including generative pre-trained transformers (GPT), and their impact on fields such as healthcare, gaming, and mathematics.\",\n", + " \"main_topics\": [\n", + " \"Classifiers and pattern matching\",\n", + " \"Artificial neural networks\",\n", + " \"Deep learning\",\n", + " \"Generative pre-trained transformers (GPT)\",\n", + " \"Hardware and software for AI\",\n", + " \"Applications of AI\",\n", + " \"AI in healthcare\",\n", + " \"AI in games\",\n", + " \"AI in mathematics\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"Artificial intelligence (AI) is the capability of computational systems to perform tasks typically associated with human intelligence, such as learning, reasoning, problem-solving, perception, and decision-making. It is a field of research in computer science that develops methods and software enabling machines to perceive their environment and take actions to achieve defined goals. AI has seen significant advancements and applications in various domains, including web search engines, recommendation systems, virtual assistants, and autonomous vehicles, among others.\",\n", + " \"main_topics\": [\n", + " \"Goals\",\n", + " \"Reasoning and problem-solving\",\n", + " \"Knowledge representation\",\n", + " \"Planning and decision-making\",\n", + " \"Learning\",\n", + " \"Applications\",\n", + " \"Philosophy\",\n", + " \"History\",\n", + " \"Controversies\",\n", + " \"Ethics\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"The article discusses artificial intelligence (AI), its various techniques, and applications. It covers the foundational concepts of AI, including machine learning, natural language processing, perception, social intelligence, and general intelligence. 
The article also highlights the methods used in AI research, such as search and optimization, logic, probabilistic methods, and classifiers.\",\n", + " \"main_topics\": [\n", + " \"Markov decision processes\",\n", + " \"Machine learning\",\n", + " \"Supervised learning\",\n", + " \"Unsupervised learning\",\n", + " \"Reinforcement learning\",\n", + " \"Transfer learning\",\n", + " \"Deep learning\",\n", + " \"Natural language processing\",\n", + " \"Machine perception\",\n", + " \"Social intelligence\",\n", + " \"Artificial general intelligence\",\n", + " \"Search and optimization\",\n", + " \"Logic\",\n", + " \"Probabilistic methods\",\n", + " \"Classifiers and statistical learning methods\",\n", + " \"Artificial neural networks\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses the complexities and challenges associated with artificial intelligence (AI), particularly focusing on issues of bias, fairness, transparency, and the potential risks posed by AI technologies. It highlights the ethical implications of AI systems, the lack of diversity among AI developers, and the existential risks associated with advanced AI.\",\n", + " \"main_topics\": [\n", + " \"Bias and fairness in AI\",\n", + " \"Lack of transparency in AI systems\",\n", + " \"Weaponization of AI\",\n", + " \"Technological unemployment due to AI\",\n", + " \"Existential risk from AI\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"The article discusses the advancements and applications of artificial intelligence (AI) across various fields, including mathematics, finance, military, generative AI, and more. 
It highlights the capabilities of AI models, their limitations, and the ethical considerations surrounding their use.\",\n", + " \"main_topics\": [\n", + " \"Mathematics\",\n", + " \"Finance\",\n", + " \"Military applications\",\n", + " \"Generative AI\",\n", + " \"AI agents\",\n", + " \"Web search\",\n", + " \"Sexuality\",\n", + " \"Industry-specific tasks\",\n", + " \"Ethics\",\n", + " \"Privacy and copyright\",\n", + " \"Dominance by tech giants\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses various aspects of artificial intelligence (AI), including its impact on privacy, copyright issues, environmental concerns, misinformation, and algorithmic bias. It highlights the dominance of big tech companies in the AI landscape and the increasing power demands of AI technologies.\",\n", + " \"main_topics\": [\n", + " \"Privacy and Fairness\",\n", + " \"Generative AI and Copyright\",\n", + " \"Dominance by Tech Giants\",\n", + " \"Power Needs and Environmental Impacts\",\n", + " \"Misinformation\",\n", + " \"Algorithmic Bias and Fairness\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"The article discusses the mixed opinions among experts regarding the risks associated with artificial intelligence (AI), particularly concerning superintelligent AI. It highlights concerns from notable figures in the field about existential risks, the importance of establishing safety guidelines, and the ongoing debate between pessimistic and optimistic views on AI's future impact. 
The article also covers ethical considerations, open-source developments, regulatory efforts, and the historical context of AI research.\",\n", + " \"main_topics\": [\n", + " \"Expert opinions on AI risks\",\n", + " \"Existential risk from superintelligent AI\",\n", + " \"Ethical machines and alignment\",\n", + " \"Open-source AI\",\n", + " \"Regulation of artificial intelligence\",\n", + " \"History of artificial intelligence\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses the history, development, and various approaches to artificial intelligence (AI), highlighting key milestones, challenges, and philosophical debates surrounding the field. It covers the evolution from early optimism and funding cuts to the resurgence of interest through expert systems and deep learning, as well as the implications of AI advancements on society.\",\n", + " \"main_topics\": [\n", + " \"History of AI\",\n", + " \"AI winter\",\n", + " \"Expert systems\",\n", + " \"Deep learning\",\n", + " \"Artificial general intelligence (AGI)\",\n", + " \"Philosophy of AI\",\n", + " \"Defining artificial intelligence\",\n", + " \"Evaluating approaches to AI\",\n", + " \"Symbolic AI vs. sub-symbolic AI\",\n", + " \"Narrow AI vs. general AI\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. 
It encompasses various subfields including machine learning, natural language processing, and robotics, and aims to create systems that can perform tasks that typically require human intelligence.\",\n", + " \"main_topics\": [\n", + " \"Organoid intelligence\",\n", + " \"Robotic process automation\",\n", + " \"Wetware computer\",\n", + " \"DARWIN EU\",\n", + " \"Artificial intelligence in Wikimedia projects\",\n", + " \"AI-generated content on Wikipedia\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"The article discusses the field of artificial intelligence (AI), exploring its various branches, methodologies, and philosophical implications. It highlights the ongoing debates within the AI community regarding the pursuit of general versus narrow AI, the nature of consciousness in machines, and the ethical considerations surrounding AI rights and welfare.\",\n", + " \"main_topics\": [\n", + " \"Soft vs. hard computing\",\n", + " \"Narrow vs. general AI\",\n", + " \"Philosophy of artificial intelligence\",\n", + " \"Consciousness\",\n", + " \"Computationalism and functionalism\",\n", + " \"AI welfare and rights\",\n", + " \"Superintelligence and the singularity\",\n", + " \"Transhumanism\",\n", + " \"Artificial intelligence in fiction\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses the field of artificial intelligence (AI), covering its definitions, history, methodologies, and applications. 
It explores various aspects of AI, including machine learning, natural language processing, and robotics, as well as the challenges and ethical considerations associated with AI technologies.\",\n", + " \"main_topics\": [\n", + " \"Definitions of AI\",\n", + " \"History of AI\",\n", + " \"Machine Learning\",\n", + " \"Natural Language Processing\",\n", + " \"Robotics\",\n", + " \"Ethical Considerations\",\n", + " \"Applications of AI\",\n", + " \"AI Methodologies\",\n", + " \"Challenges in AI\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses the advancements and implications of artificial intelligence (AI), particularly focusing on generative AI and its impact across various sectors including healthcare, finance, entertainment, and environmental concerns.\",\n", + " \"main_topics\": [\n", + " \"Generative AI in software development\",\n", + " \"AI in healthcare\",\n", + " \"AI in financial services\",\n", + " \"Impact of AI on Hollywood and entertainment\",\n", + " \"AI and environmental issues\",\n", + " \"AI's role in creativity\",\n", + " \"AI in search technologies\",\n", + " \"AI's energy consumption\",\n", + " \"AI and societal implications\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses the concept of artificial intelligence (AI), its development, applications, and the ethical implications surrounding its use. 
It highlights the advancements in AI technology, including synthetic media and computational capitalism, and addresses concerns regarding misinformation and media manipulation through AI tools.\",\n", + " \"main_topics\": [\n", + " \"Definition of Artificial Intelligence\",\n", + " \"Advancements in AI technology\",\n", + " \"Synthetic media and computational capitalism\",\n", + " \"Ethical implications of AI\",\n", + " \"Misinformation and media manipulation\",\n", + " \"AI in surveillance and security\",\n", + " \"AI's impact on employment\",\n", + " \"Global regulatory frameworks for AI\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses the concept of artificial intelligence (AI), its applications, advancements, and implications across various fields, including healthcare, programming, and national security. It highlights the evolution of AI technologies, notable achievements, and the ongoing debates surrounding ethical considerations and the future of AI.\",\n", + " \"main_topics\": [\n", + " \"Definition of Artificial Intelligence\",\n", + " \"Applications in Healthcare\",\n", + " \"AI Programming Languages\",\n", + " \"Ethical Considerations\",\n", + " \"AI in National Security\",\n", + " \"Generative AI\",\n", + " \"Recent Advancements in AI Technologies\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses the field of artificial intelligence (AI), its development, applications, and the ethical considerations surrounding its use. 
It highlights the advancements in AI technologies, the impact on various sectors, and the ongoing debates regarding the implications of AI on society.\",\n", + " \"main_topics\": [\n", + " \"Definition of Artificial Intelligence\",\n", + " \"History and Development of AI\",\n", + " \"Applications of AI\",\n", + " \"Ethical Considerations in AI\",\n", + " \"Impact of AI on Employment\",\n", + " \"Governance and Regulation of AI\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article provides an overview of artificial intelligence (AI), its history, development, and various applications. It discusses the evolution of AI from its inception to its current state, highlighting key milestones and influential figures in the field. The article also addresses the philosophical implications of AI, its impact on society, and the ongoing debates surrounding its future.\",\n", + " \"main_topics\": [\n", + " \"History of AI\",\n", + " \"Key figures in AI development\",\n", + " \"Philosophical implications of AI\",\n", + " \"Applications of AI\",\n", + " \"Current trends in AI\",\n", + " \"Ethical considerations in AI\",\n", + " \"Future of AI\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses various aspects of artificial intelligence (AI), including its implications, challenges, and the need for regulatory frameworks to ensure ethical use. 
It highlights the perspectives of experts on the responsibilities of tech companies and governments in managing AI technologies.\",\n", + " \"main_topics\": [\n", + " \"Ethical implications of AI\",\n", + " \"Regulatory frameworks for AI\",\n", + " \"Transparency in AI systems\",\n", + " \"Compensation for data usage\",\n", + " \"Professional licensing for AI engineers\",\n", + " \"Limitations of natural language processing\",\n", + " \"AI in media and misinformation\",\n", + " \"AI technologies and their reliability\",\n", + " \"Generative AI and its understanding\",\n", + " \"AI applications in various fields\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses the field of artificial intelligence (AI), its history, development, and various applications. It highlights the concerns and ethical considerations surrounding AI, as well as the potential impact on society and the economy.\",\n", + " \"main_topics\": [\n", + " \"History of AI\",\n", + " \"Applications of AI\",\n", + " \"Ethical considerations\",\n", + " \"Impact on society\",\n", + " \"Machine learning\",\n", + " \"Regulation of AI\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"The article discusses the field of artificial intelligence (AI), covering its history, development, and various applications. 
It highlights the advancements in AI technologies, the ethical implications, and the ongoing debates surrounding AI's impact on society.\",\n", + " \"main_topics\": [\n", + " \"History of AI\",\n", + " \"Development of AI technologies\",\n", + " \"Applications of AI\",\n", + " \"Ethical implications of AI\",\n", + " \"Debates on AI's societal impact\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-correction. AI applications include expert systems, natural language processing, speech recognition, and machine vision.\",\n", + " \"main_topics\": [\n", + " \"Neural networks\",\n", + " \"Deep learning\",\n", + " \"Language models\",\n", + " \"Artificial general intelligence (AGI)\",\n", + " \"Computer vision\",\n", + " \"Speech recognition\",\n", + " \"Natural language processing\",\n", + " \"Robotics\",\n", + " \"Philosophy of artificial intelligence\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-correction.\",\n", + " \"main_topics\": [\n", + " \"Definition of AI\",\n", + " \"Processes involved in AI\",\n", + " \"Applications of AI\",\n", + " \"Types of AI\",\n", + " \"Ethical considerations in AI\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial Intelligence\",\n", + " \"summary\": \"Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. These processes include learning, reasoning, and self-correction. 
AI applications include expert systems, natural language processing, speech recognition, and machine vision.\",\n", + " \"main_topics\": [\n", + " \"Natural language processing\",\n", + " \"Knowledge representation and reasoning\",\n", + " \"Computer vision\",\n", + " \"Automated planning and scheduling\",\n", + " \"Search methodology\",\n", + " \"Control method\",\n", + " \"Philosophy of artificial intelligence\",\n", + " \"Distributed artificial intelligence\",\n", + " \"Machine learning\"\n", + " ],\n", + " \"error\": false\n", + " },\n", + " {\n", + " \"title\": \"Artificial intelligence\",\n", + " \"summary\": \"Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of \\\"intelligent agents\\\": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Colloquially, the term \\\"artificial intelligence\\\" is often used to describe machines (or computers) that mimic \\\"cognitive\\\" functions that humans associate with the human mind, such as \\\"learning\\\" and \\\"problem-solving\\\".\",\n", + " \"main_topics\": [\n", + " \"Automation\",\n", + " \"Ethics of technology\",\n", + " \"AI alignment\",\n", + " \"AI safety\",\n", + " \"Technological singularity\",\n", + " \"Machine ethics\",\n", + " \"Existential risk from artificial intelligence\",\n", + " \"Artificial general intelligence\",\n", + " \"AI takeover\",\n", + " \"AI capability control\"\n", + " ],\n", + " \"error\": false\n", + " }\n", + "]\n", + "\n", + "============================================================\n", + "βœ… Enhanced LLM Demo Complete!\n", + "\n" + ] + } + ], + "source": [ + "from crawl4ai import LLMExtractionStrategy, LLMConfig\n", + "from pydantic import BaseModel, Field\n", + "import os\n", + "\n", + "# Define extraction schema\n", + "class Article(BaseModel):\n", + " title: 
str = Field(description=\"Article title\")\n", + " summary: str = Field(description=\"Brief summary of the article\")\n", + " main_topics: list[str] = Field(description=\"List of main topics covered\")\n", + "\n", + "async def demo_enhanced_llm():\n", + " \"\"\"\n", + " Demonstrate enhanced LLM integration with custom temperature\n", + " \"\"\"\n", + " print(\"πŸ€– Testing Enhanced LLM Integration\\n\")\n", + " print(\"=\" * 60)\n", + " \n", + " # Check for API key\n", + " api_key = os.getenv('OPENAI_API_KEY')\n", + " if not api_key:\n", + " print(\"⚠️ Note: Set OPENAI_API_KEY environment variable to test LLM extraction\")\n", + " print(\"For this demo, we'll show the configuration only.\\n\")\n", + " \n", + " print(\"πŸ“ Example LLM Configuration with new v0.7.5 features:\")\n", + " print(\"\"\"\n", + "llm_strategy = LLMExtractionStrategy(\n", + " llm_config=LLMConfig(\n", + " provider=\"openai/gpt-4o-mini\",\n", + " api_token=\"your-api-key\",\n", + " temperature=0.7, # πŸ†• NEW: Control creativity (0.0-2.0)\n", + " base_url=\"custom-endpoint\" # πŸ†• NEW: Custom API endpoint\n", + " ),\n", + " schema=Article.schema(),\n", + " extraction_type=\"schema\",\n", + " instruction=\"Extract article information\"\n", + ")\n", + " \"\"\")\n", + " return\n", + " \n", + " # Create LLM extraction strategy with custom temperature\n", + " llm_strategy = LLMExtractionStrategy(\n", + " llm_config=LLMConfig(\n", + " provider=\"openai/gpt-4o-mini\",\n", + " api_token=api_key,\n", + " temperature=0.3, # πŸ†• Lower temperature for more focused extraction\n", + " ),\n", + " schema=Article.schema(),\n", + " extraction_type=\"schema\",\n", + " instruction=\"Extract the article title, a brief summary, and main topics discussed.\"\n", + " )\n", + " \n", + " config = CrawlerRunConfig(\n", + " extraction_strategy=llm_strategy,\n", + " cache_mode=CacheMode.BYPASS\n", + " )\n", + " \n", + " async with AsyncWebCrawler() as crawler:\n", + " result = await crawler.arun(\n", + " 
url=\"https://en.wikipedia.org/wiki/Artificial_intelligence\",\n", + " config=config\n", + " )\n", + " \n", + " if result.success:\n", + " print(\"\\nβœ… LLM Extraction Successful!\")\n", + " print(f\"\\nπŸ“„ Extracted Content:\")\n", + " print(result.extracted_content)\n", + " else:\n", + " print(f\"\\n❌ Extraction failed: {result.error_message}\")\n", + " \n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"βœ… Enhanced LLM Demo Complete!\\n\")\n", + "\n", + "# Run the demo\n", + "await demo_enhanced_llm()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## πŸ”§ Feature 3: Docker Hooks System (NEW! πŸ†•)\n", + "\n", + "### What is it?\n", + "v0.7.5 introduces a **completely new Docker Hooks System** that lets you inject custom Python functions at 8 key points in the crawling pipeline. This gives you full control over:\n", + "- Authentication setup\n", + "- Performance optimization\n", + "- Content processing\n", + "- Custom behavior at each stage\n", + "\n", + "### Three Ways to Use Docker Hooks\n", + "\n", + "The Docker Hooks System offers three approaches, all part of this new feature:\n", + "\n", + "1. **String-based hooks** - Write hooks as strings for REST API\n", + "2. **Using `hooks_to_string()` utility** - Convert Python functions to strings\n", + "3. **Docker Client auto-conversion** - Pass functions directly (most convenient)\n", + "\n", + "All three approaches are NEW in v0.7.5!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating Reusable Hook Functions\n", + "\n", + "First, let's create some hook functions that we can reuse:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ… Reusable hook library created!\n", + "\n", + "πŸ“š Available hooks:\n", + " β€’ block_images_hook - Speed optimization\n", + " β€’ set_viewport_hook - Consistent rendering\n", + " β€’ add_custom_headers_hook - Custom headers\n", + " β€’ scroll_page_hook - Lazy content loading\n", + " β€’ log_page_metrics_hook - Page analytics\n" + ] + } + ], + "source": [ + "# Define reusable hooks as Python functions\n", + "\n", + "async def block_images_hook(page, context, **kwargs):\n", + " \"\"\"\n", + " Performance optimization: Block images to speed up crawling\n", + " \"\"\"\n", + " print(\"[Hook] Blocking images for faster loading...\")\n", + " await context.route(\n", + " \"**/*.{png,jpg,jpeg,gif,webp,svg,ico}\",\n", + " lambda route: route.abort()\n", + " )\n", + " return page\n", + "\n", + "async def set_viewport_hook(page, context, **kwargs):\n", + " \"\"\"\n", + " Set consistent viewport size for rendering\n", + " \"\"\"\n", + " print(\"[Hook] Setting viewport to 1920x1080...\")\n", + " await page.set_viewport_size({\"width\": 1920, \"height\": 1080})\n", + " return page\n", + "\n", + "async def add_custom_headers_hook(page, context, url, **kwargs):\n", + " \"\"\"\n", + " Add custom headers before navigation\n", + " \"\"\"\n", + " print(f\"[Hook] Adding custom headers for {url}...\")\n", + " await page.set_extra_http_headers({\n", + " 'X-Crawl4AI-Version': '0.7.5',\n", + " 'X-Custom-Header': 'docker-hooks-demo',\n", + " 'Accept-Language': 'en-US,en;q=0.9'\n", + " })\n", + " return page\n", + "\n", + "async def scroll_page_hook(page, context, **kwargs):\n", + " \"\"\"\n", + " Scroll page to load lazy-loaded content\n", + " 
\"\"\"\n", + " print(\"[Hook] Scrolling page to load lazy content...\")\n", + " await page.evaluate(\"window.scrollTo(0, document.body.scrollHeight)\")\n", + " await page.wait_for_timeout(1000)\n", + " await page.evaluate(\"window.scrollTo(0, 0)\")\n", + " await page.wait_for_timeout(500)\n", + " return page\n", + "\n", + "async def log_page_metrics_hook(page, context, **kwargs):\n", + " \"\"\"\n", + " Log page metrics before extracting HTML\n", + " \"\"\"\n", + " metrics = await page.evaluate('''\n", + " () => ({\n", + " images: document.images.length,\n", + " links: document.links.length,\n", + " scripts: document.scripts.length,\n", + " title: document.title\n", + " })\n", + " ''')\n", + " print(f\"[Hook] Page Metrics - Title: {metrics['title']}\")\n", + " print(f\" Images: {metrics['images']}, Links: {metrics['links']}, Scripts: {metrics['scripts']}\")\n", + " return page\n", + "\n", + "print(\"βœ… Reusable hook library created!\")\n", + "print(\"\\nπŸ“š Available hooks:\")\n", + "print(\" β€’ block_images_hook - Speed optimization\")\n", + "print(\" β€’ set_viewport_hook - Consistent rendering\")\n", + "print(\" β€’ add_custom_headers_hook - Custom headers\")\n", + "print(\" β€’ scroll_page_hook - Lazy content loading\")\n", + "print(\" β€’ log_page_metrics_hook - Page analytics\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using hooks_to_string() Utility\n", + "\n", + "The new `hooks_to_string()` utility converts Python function objects to strings that can be sent to the Docker API:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ… Converted 3 hook functions to string format\n", + "\n", + "πŸ“ Example of converted hook (first 200 chars):\n", + "async def block_images_hook(page, context, **kwargs):\n", + " \"\"\"\n", + " Performance optimization: Block images to speed up crawling\n", + " \"\"\"\n", + " 
print(\"[Hook] Blocking images for faster loading...\")\n", + " awai...\n", + "\n", + "πŸ’‘ Benefits of hooks_to_string():\n", + " βœ“ Write hooks as Python functions (IDE support, type checking)\n", + " βœ“ Automatically converts to string format for Docker API\n", + " βœ“ Reusable across projects\n", + " βœ“ Easy to test and debug\n" + ] + } + ], + "source": [ + "# Convert functions to strings using the NEW utility\n", + "hooks_as_strings = hooks_to_string({\n", + " \"on_page_context_created\": block_images_hook,\n", + " \"before_goto\": add_custom_headers_hook,\n", + " \"before_retrieve_html\": scroll_page_hook,\n", + "})\n", + "\n", + "print(\"βœ… Converted 3 hook functions to string format\")\n", + "print(\"\\nπŸ“ Example of converted hook (first 200 chars):\")\n", + "print(hooks_as_strings[\"on_page_context_created\"][:200] + \"...\")\n", + "\n", + "print(\"\\nπŸ’‘ Benefits of hooks_to_string():\")\n", + "print(\" βœ“ Write hooks as Python functions (IDE support, type checking)\")\n", + "print(\" βœ“ Automatically converts to string format for Docker API\")\n", + "print(\" βœ“ Reusable across projects\")\n", + "print(\" βœ“ Easy to test and debug\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 8 Available Hook Points\n", + "\n", + "The Docker Hooks System provides 8 strategic points where you can inject custom behavior:\n", + "\n", + "1. **on_browser_created** - Browser initialization\n", + "2. **on_page_context_created** - Page context setup\n", + "3. **on_user_agent_updated** - User agent configuration\n", + "4. **before_goto** - Pre-navigation setup\n", + "5. **after_goto** - Post-navigation processing\n", + "6. **on_execution_started** - JavaScript execution start\n", + "7. **before_retrieve_html** - Pre-extraction processing\n", + "8. 
**before_return_html** - Final HTML processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Complete Docker Hooks Demo\n", + "\n", + "**Note**: For a complete demonstration of all Docker Hooks approaches including:\n", + "- String-based hooks with REST API\n", + "- hooks_to_string() utility usage\n", + "- Docker Client with automatic conversion\n", + "- Complete pipeline with all 8 hook points\n", + "\n", + "See the separate file: **`v0.7.5_docker_hooks_demo.py`**\n", + "\n", + "This standalone Python script provides comprehensive, runnable examples of the entire Docker Hooks System." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## πŸ› οΈ Feature 4: Bug Fixes Summary\n", + "\n", + "### Major Fixes in v0.7.5\n", + "\n", + "1. **URL Processing** - Fixed '+' sign preservation in query parameters\n", + "2. **Proxy Configuration** - Enhanced proxy string parsing (old parameter deprecated)\n", + "3. **Docker Error Handling** - Better error messages with status codes\n", + "4. **Memory Management** - Fixed leaks in long-running sessions\n", + "5. **JWT Authentication** - Fixed Docker JWT validation\n", + "6. **Playwright Stealth** - Fixed stealth features\n", + "7. **API Configuration** - Fixed config handling\n", + "8. **Deep Crawl Strategy** - Resolved JSON encoding errors\n", + "9. **LLM Provider Support** - Fixed custom provider integration\n", + "10. 
**Performance** - Resolved backoff strategy failures\n", + "\n", + "### New Proxy Configuration Example" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ… New proxy configuration format demonstrated\n", + "\n", + "πŸ“ Benefits:\n", + " β€’ More explicit and clear\n", + " β€’ Better authentication support\n", + " β€’ Consistent with industry standards\n" + ] + } + ], + "source": [ + "# OLD WAY (Deprecated)\n", + "# browser_config = BrowserConfig(proxy=\"http://proxy:8080\")\n", + "\n", + "# NEW WAY (v0.7.5)\n", + "browser_config_with_proxy = BrowserConfig(\n", + " proxy_config={\n", + " \"server\": \"http://proxy.example.com:8080\",\n", + " \"username\": \"optional-username\", # Optional\n", + " \"password\": \"optional-password\" # Optional\n", + " }\n", + ")\n", + "\n", + "print(\"βœ… New proxy configuration format demonstrated\")\n", + "print(\"\\nπŸ“ Benefits:\")\n", + "print(\" β€’ More explicit and clear\")\n", + "print(\" β€’ Better authentication support\")\n", + "print(\" β€’ Consistent with industry standards\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## 🎯 Complete Example: Combining Multiple Features\n", + "\n", + "Let's create a real-world example that uses multiple v0.7.5 features together:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🎯 Complete v0.7.5 Feature Demo\n", + "\n", + "============================================================\n", + "\n", + "1️⃣ Using Docker Hooks System (NEW!)\n", + " βœ“ Converted 3 hooks to string format\n", + " βœ“ Ready to send to Docker API\n", + "\n", + "2️⃣ Enabling HTTPS Preservation\n", + " βœ“ HTTPS preservation enabled\n", + "\n", + "3️⃣ Using New Proxy Configuration Format\n", + " βœ“ New proxy config format ready\n", + "\n", + 
"4️⃣ Executing Crawl with All Features\n" + ] + }, + { + "data": { + "text/html": [ + "
[INIT].... β†’ Crawl4AI 0.7.5 \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;36m[\u001b[0m\u001b[36mINIT\u001b[0m\u001b[1;36m]\u001b[0m\u001b[36m...\u001b[0m\u001b[36m. β†’ Crawl4AI \u001b[0m\u001b[1;36m0.7\u001b[0m\u001b[36m.\u001b[0m\u001b[1;36m5\u001b[0m\u001b[36m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[FETCH]... ↓ https://example.com                                                                                  |\n",
+       "βœ“ | ⏱: 1.29s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mFETCH\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m...\u001b[0m\u001b[32m ↓ \u001b[0m\u001b[4;32mhttps://example.com\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m1.\u001b[0m\u001b[32m29s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[SCRAPE].. β—† https://example.com                                                                                  |\n",
+       "βœ“ | ⏱: 0.00s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mSCRAPE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m.. β—† \u001b[0m\u001b[4;32mhttps://example.com\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m0.\u001b[0m\u001b[32m00s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[COMPLETE] ● https://example.com                                                                                  |\n",
+       "βœ“ | ⏱: 1.29s \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;32m[\u001b[0m\u001b[32mCOMPLETE\u001b[0m\u001b[1;32m]\u001b[0m\u001b[32m ● \u001b[0m\u001b[4;32mhttps://example.com\u001b[0m\u001b[32m |\u001b[0m\n", + "\u001b[32mβœ“\u001b[0m\u001b[32m | ⏱: \u001b[0m\u001b[1;32m1.\u001b[0m\u001b[32m29s \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " βœ“ Crawl successful!\n", + "\n", + "πŸ“Š Results:\n", + " β€’ Pages crawled: 1\n", + " β€’ Title: Example Domain\n", + " β€’ Content length: 119 characters\n", + " β€’ Links found: 0\n", + "\n", + "============================================================\n", + "βœ… Complete Feature Demo Finished!\n", + "\n" + ] + } + ], + "source": [ + "async def complete_demo():\n", + " \"\"\"\n", + " Comprehensive demo combining multiple v0.7.5 features\n", + " \"\"\"\n", + " print(\"🎯 Complete v0.7.5 Feature Demo\\n\")\n", + " print(\"=\" * 60)\n", + " \n", + " # Use function-based hooks (NEW Docker Hooks System)\n", + " print(\"\\n1️⃣ Using Docker Hooks System (NEW!)\")\n", + " hooks = {\n", + " \"on_page_context_created\": set_viewport_hook,\n", + " \"before_goto\": add_custom_headers_hook,\n", + " \"before_retrieve_html\": log_page_metrics_hook\n", + " }\n", + " \n", + " # Convert to strings using the NEW utility\n", + " hooks_strings = hooks_to_string(hooks)\n", + " print(f\" βœ“ Converted {len(hooks_strings)} hooks to string format\")\n", + " print(\" βœ“ Ready to send to Docker API\")\n", + " \n", + " # Use HTTPS preservation\n", + " print(\"\\n2️⃣ Enabling HTTPS Preservation\")\n", + " url_filter = URLPatternFilter(\n", + " patterns=[r\"^(https:\\/\\/)?example\\.com(\\/.*)?$\"]\n", + " )\n", + " \n", + " config = CrawlerRunConfig(\n", + " exclude_external_links=True,\n", + " preserve_https_for_internal_links=True, # v0.7.5 feature\n", + " cache_mode=CacheMode.BYPASS,\n", + " deep_crawl_strategy=BFSDeepCrawlStrategy(\n", + " max_depth=1,\n", + " 
max_pages=3,\n", + " filter_chain=FilterChain([url_filter])\n", + " )\n", + " )\n", + " print(\" βœ“ HTTPS preservation enabled\")\n", + " \n", + " # Use new proxy config format\n", + " print(\"\\n3️⃣ Using New Proxy Configuration Format\")\n", + " browser_config = BrowserConfig(\n", + " headless=True,\n", + " # proxy_config={ # Uncomment if you have a proxy\n", + " # \"server\": \"http://proxy:8080\"\n", + " # }\n", + " )\n", + " print(\" βœ“ New proxy config format ready\")\n", + " \n", + " # Run the crawl\n", + " print(\"\\n4️⃣ Executing Crawl with All Features\")\n", + " async with AsyncWebCrawler(config=browser_config) as crawler:\n", + " # With deep_crawl_strategy, returns a list\n", + " results = await crawler.arun(\n", + " url=\"https://example.com\",\n", + " config=config\n", + " )\n", + " \n", + " if results and len(results) > 0:\n", + " result = results[0] # Get first result\n", + " print(\" βœ“ Crawl successful!\")\n", + " print(f\"\\nπŸ“Š Results:\")\n", + " print(f\" β€’ Pages crawled: {len(results)}\")\n", + " print(f\" β€’ Title: {result.metadata.get('title', 'N/A')}\")\n", + " print(f\" β€’ Content length: {len(result.markdown.raw_markdown)} characters\")\n", + " print(f\" β€’ Links found: {len(result.links['internal']) + len(result.links['external'])}\")\n", + " else:\n", + " print(f\" ⚠️ No results returned\")\n", + " \n", + " print(\"\\n\" + \"=\" * 60)\n", + " print(\"βœ… Complete Feature Demo Finished!\\n\")\n", + "\n", + "# Run complete demo\n", + "await complete_demo()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "## πŸŽ“ Summary\n", + "\n", + "### What We Covered\n", + "\n", + "βœ… **HTTPS Preservation** - Maintain secure protocols throughout crawling \n", + "βœ… **Enhanced LLM Integration** - Custom temperature and provider configuration \n", + "βœ… **Docker Hooks System (NEW!)** - Complete pipeline customization with 3 approaches \n", + "βœ… **hooks_to_string() Utility (NEW!)** - Convert 
functions for Docker API \n", + "βœ… **Bug Fixes** - New proxy config and multiple improvements \n", + "\n", + "### Key Highlight: Docker Hooks System 🌟\n", + "\n", + "The **Docker Hooks System** is completely NEW in v0.7.5. It offers:\n", + "- 8 strategic hook points in the pipeline\n", + "- 3 ways to use hooks (strings, utility, auto-conversion)\n", + "- Full control over crawling behavior\n", + "- Support for authentication, optimization, and custom processing\n", + "\n", + "### Next Steps\n", + "\n", + "1. **Docker Hooks Demo** - See `v0.7.5_docker_hooks_demo.py` for complete Docker Hooks examples\n", + "2. **Documentation** - Visit [docs.crawl4ai.com](https://docs.crawl4ai.com) for full reference\n", + "3. **Examples** - Check [GitHub examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples)\n", + "4. **Community** - Join [Discord](https://discord.gg/jP8KfhDhyN) for support\n", + "\n", + "---\n", + "\n", + "## πŸ“š Resources\n", + "\n", + "- πŸ“– [Full Documentation](https://docs.crawl4ai.com)\n", + "- πŸ™ [GitHub Repository](https://github.com/unclecode/crawl4ai)\n", + "- πŸ“ [Release Notes](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)\n", + "- πŸ’¬ [Discord Community](https://discord.gg/jP8KfhDhyN)\n", + "- 🐦 [Twitter](https://x.com/unclecode)\n", + "\n", + "---\n", + "\n", + "**Happy Crawling with v0.7.5! πŸš€**" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}