diff --git a/CHANGELOG.md b/CHANGELOG.md
index d62d8775..a9d363c4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -29,7 +29,7 @@ This release introduces several powerful new features, including robots.txt comp
 - **URL Redirection Tracking:**
   - Added URL redirection tracking to capture the final URL after any redirects.
-  - The final URL is now available in the `final_url` field of the `AsyncCrawlResponse` object.
+  - The final URL is now available in the `redirected_url` field of the `AsyncCrawlResponse` object.
 - **Enhanced Streamlined Documentation:**
   - Refactored and improved the documentation structure for clarity and ease of use.
diff --git a/README.md b/README.md
index 9cfe4512..1bcaf910 100644
--- a/README.md
+++ b/README.md
@@ -492,7 +492,7 @@ async def test_news_crawl():
 - **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental).
 - **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching.
 - **🔄 Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence.
-- **⚡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects.
+- **⚡️ URL Redirection Tracking**: The `redirected_url` field now captures the final destination after any redirects.
 - **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites.
 - **📊 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`.
 - **📚 Improved Documentation**: More examples, clearer explanations, and updated tutorials.
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index a2bb7b96..738dfb51 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1254,7 +1254,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         config.url = url
         response_headers = {}
         status_code = None
-        final_url = url
+        redirected_url = url

         # Reset downloaded files list for new crawl
         self._downloaded_files = []
@@ -1336,7 +1336,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 response = await page.goto(
                     url, wait_until=config.wait_until, timeout=config.page_timeout
                 )
-                final_url = page.url
+                redirected_url = page.url
             except Error as e:
                 raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")

@@ -1616,7 +1616,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 downloaded_files=(
                     self._downloaded_files if self._downloaded_files else None
                 ),
-                final_url=final_url,
+                redirected_url=redirected_url,
             )

         except Exception as e:
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index dc7e2cb9..617b6901 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -462,7 +462,7 @@ class AsyncWebCrawler:
                 )

                 crawl_result.status_code = async_response.status_code
-                crawl_result.redirected_url = async_response.final_url or url
+                crawl_result.redirected_url = async_response.redirected_url or url
                 crawl_result.response_headers = async_response.response_headers
                 crawl_result.downloaded_files = async_response.downloaded_files
                 crawl_result.ssl_certificate = (
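Taken together, the hunks above thread the post-redirect URL from the Playwright strategy through `AsyncWebCrawler` and onto the final crawl result. A minimal sketch of reading it after this change (it assumes the usual `AsyncWebCrawler.arun` entry point, which this patch does not modify):

```python
import asyncio

from crawl4ai import AsyncWebCrawler


async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="http://example.com")
        # Per the async_webcrawler.py hunk above, redirected_url holds the
        # final destination after any redirects and falls back to the
        # requested URL when no redirect occurred.
        print(result.redirected_url)


asyncio.run(main())
```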
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 81e08b0c..57edacd7 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -132,7 +132,7 @@ class AsyncCrawlResponse(BaseModel):
     get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
     downloaded_files: Optional[List[str]] = None
     ssl_certificate: Optional[SSLCertificate] = None
-    final_url: Optional[str] = None
+    redirected_url: Optional[str] = None

     class Config:
         arbitrary_types_allowed = True
Memory Dispatcher System Demo ===") - - # Configure crawler - browser_config = BrowserConfig(headless=True, verbose=True) - crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator() - ) - - # Test URLs - urls = ["http://example.com", "http://example.org", "http://example.net"] * 3 - - async with AsyncWebCrawler(config=browser_config) as crawler: - # Initialize dispatcher with monitoring - monitor = CrawlerMonitor( - max_visible_rows=10, - display_mode=DisplayMode.DETAILED, # Can be DETAILED or AGGREGATED + print("\n=== Memory Dispatcher Demo ===") + + try: + # Configuration + browser_config = BrowserConfig(headless=True, verbose=False) + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator() ) - dispatcher = MemoryAdaptiveDispatcher( - memory_threshold_percent=80.0, # Memory usage threshold - check_interval=0.5, # How often to check memory - max_session_permit=5, # Max concurrent crawls - monitor=monitor, # Pass the monitor - ) + # Test URLs + urls = ["http://example.com", "http://example.org", "http://example.net"] * 3 - # Run with memory monitoring - print("Starting batch crawl with memory monitoring...") - results = await dispatcher.run_urls( - urls=urls, - crawler=crawler, - config=crawler_config, - ) - print(f"Completed {len(results)} URLs") + print("\nπ Initializing crawler with memory monitoring...") + async with AsyncWebCrawler(config=browser_config) as crawler: + monitor = CrawlerMonitor( + max_visible_rows=10, + display_mode=DisplayMode.DETAILED + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, + check_interval=0.5, + max_session_permit=5, + monitor=monitor + ) + + print("\nπ Starting batch crawl...") + results = await dispatcher.run_urls( + urls=urls, + crawler=crawler, + config=crawler_config, + ) + print(f"\nβ Completed {len(results)} URLs successfully") + + except Exception as e: + print(f"\nβ Error in memory dispatcher demo: {str(e)}") async def demo_streaming_support(): @@ -60,7 +102,7 @@ async def demo_streaming_support(): """ print("\n=== 2. Streaming Support Demo ===") - browser_config = BrowserConfig(headless=True, verbose=True) + browser_config = BrowserConfig(headless=True, verbose=False) crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True) # Test URLs @@ -179,7 +221,7 @@ async def demo_robots_compliance(): -async def demo_llm_schema_generation(): +async def demo_json_schema_generation(): """ 7. LLM-Powered Schema Generation Demo ================================= @@ -233,25 +275,6 @@ async def demo_llm_schema_generation(): print("Successfully used generated schema for crawling") -async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]: - """Get next proxy from local file""" - try: - with open(proxy_file) as f: - proxies = f.read().splitlines() - if not proxies: - return None - - ip, port, username, password = random.choice(proxies).split(":") - return { - "server": f"http://{ip}:{port}", - "username": username, - "password": password, - "ip": ip # Store original IP for verification - } - except Exception as e: - print(f"Error loading proxy: {e}") - return None - async def demo_proxy_rotation(): """ 8. Proxy Rotation Demo @@ -259,12 +282,28 @@ async def demo_proxy_rotation(): Demonstrates how to rotate proxies for each request using Crawl4ai. """ print("\n=== 8. 
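The `stream=True` flag enabled above is what the streaming demo exercises: results are consumed as each crawl completes instead of after the whole batch finishes. A sketch of the typical consumption loop, using the names imported at the top of this demo file (it assumes `arun_many` honors the `stream` flag as documented in later releases, so treat the exact call as illustrative):

```python
async def consume_streaming_results():
    browser_config = BrowserConfig(headless=True, verbose=False)
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)
    urls = ["http://example.com", "http://example.org", "http://example.net"]

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # With stream=True, arun_many yields each result as soon as its
        # crawl finishes rather than returning one list at the end.
        async for result in await crawler.arun_many(urls, config=crawler_config):
            print(result.url, "ok" if result.success else "failed")
```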
Proxy Rotation Demo ===") + + async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]: + """Get next proxy from local file""" + try: + proxies = os.getenv("PROXIES", "").split(",") + + ip, port, username, password = random.choice(proxies).split(":") + return { + "server": f"http://{ip}:{port}", + "username": username, + "password": password, + "ip": ip # Store original IP for verification + } + except Exception as e: + print(f"Error loading proxy: {e}") + return None # Create 10 test requests to httpbin - urls = ["https://httpbin.org/ip"] * 3 + urls = ["https://httpbin.org/ip"] * 2 - browser_config = BrowserConfig(headless=True) + browser_config = BrowserConfig(headless=True, verbose=False) run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) async with AsyncWebCrawler(config=browser_config) as crawler: @@ -289,24 +328,25 @@ async def demo_proxy_rotation(): else: print(f"Failed with proxy {proxy['ip']}") -if __name__ == "__main__": - async def main(): """Run all feature demonstrations.""" - demo_memory_dispatcher(), - print("\n" + "=" * 50 + "\n") - demo_streaming_support(), - print("\n" + "=" * 50 + "\n") - demo_content_scraping(), - print("\n" + "=" * 50 + "\n") - demo_llm_schema_generation(), - print("\n" + "=" * 50 + "\n") - demo_llm_markdown(), - print("\n" + "=" * 50 + "\n") - demo_robots_compliance(), - print("\n" + "=" * 50 + "\n") - demo_proxy_rotation() - print("\n" + "=" * 50 + "\n") + print("\nπ Running Crawl4ai v0.4.3 Feature Demos\n") + + # Efficiency & Speed Demos + # print("\nπ EFFICIENCY & SPEED DEMOS") + # await demo_memory_dispatcher() + # await demo_streaming_support() + # await demo_content_scraping() + + # # LLM Integration Demos + # print("\nπ€ LLM INTEGRATION DEMOS") + # await demo_json_schema_generation() + # await demo_llm_markdown() + + # # Core Improvements + # print("\nπ§ CORE IMPROVEMENT DEMOS") + # await demo_robots_compliance() + await demo_proxy_rotation() if __name__ == "__main__": asyncio.run(main()) diff --git a/docs/md_v2/blog/releases/v0.4.3b1.md b/docs/md_v2/blog/releases/v0.4.3b1.md index f648b462..9b027dd6 100644 --- a/docs/md_v2/blog/releases/v0.4.3b1.md +++ b/docs/md_v2/blog/releases/v0.4.3b1.md @@ -1,266 +1,138 @@ -# Crawl4AI 0.4.3b1 is Here: Faster, Smarter, and Ready for Real-World Crawling! +# Crawl4AI 0.4.3: Major Performance Boost & LLM Integration -Hey, Crawl4AI enthusiasts! We're thrilled to announce the release of **Crawl4AI 0.4.3b1**, packed with powerful new features and enhancements that take web crawling to a whole new level of efficiency and intelligence. This release is all about giving you more control, better performance, and deeper insights into your crawled data. +We're excited to announce Crawl4AI 0.4.3, focusing on three key areas: Speed & Efficiency, LLM Integration, and Core Platform Improvements. This release significantly improves crawling performance while adding powerful new LLM-powered features. -Let's dive into what's new! +## β‘ Speed & Efficiency Improvements -## π Major Feature Highlights - -### 1. LLM-Powered Schema Generation: Zero to Structured Data in Seconds! - -Tired of manually crafting CSS or XPath selectors? We've got you covered! Crawl4AI now features a revolutionary **schema generator** that uses the power of Large Language Models (LLMs) to automatically create extraction schemas for you. - -**How it Works:** - -1. **Provide HTML**: Feed in a sample HTML snippet that contains the type of data you want to extract (e.g., product listings, article sections). -2. 
diff --git a/docs/md_v2/blog/releases/v0.4.3b1.md b/docs/md_v2/blog/releases/v0.4.3b1.md
index f648b462..9b027dd6 100644
--- a/docs/md_v2/blog/releases/v0.4.3b1.md
+++ b/docs/md_v2/blog/releases/v0.4.3b1.md
@@ -1,266 +1,138 @@
-# Crawl4AI 0.4.3b1 is Here: Faster, Smarter, and Ready for Real-World Crawling!
+# Crawl4AI 0.4.3: Major Performance Boost & LLM Integration

-Hey, Crawl4AI enthusiasts! We're thrilled to announce the release of **Crawl4AI 0.4.3b1**, packed with powerful new features and enhancements that take web crawling to a whole new level of efficiency and intelligence. This release is all about giving you more control, better performance, and deeper insights into your crawled data.
+We're excited to announce Crawl4AI 0.4.3, focusing on three key areas: Speed & Efficiency, LLM Integration, and Core Platform Improvements. This release significantly improves crawling performance while adding powerful new LLM-powered features.

-Let's dive into what's new!
+## ⚡ Speed & Efficiency Improvements

-## 🚀 Major Feature Highlights
-
-### 1. LLM-Powered Schema Generation: Zero to Structured Data in Seconds!
-
-Tired of manually crafting CSS or XPath selectors? We've got you covered! Crawl4AI now features a revolutionary **schema generator** that uses the power of Large Language Models (LLMs) to automatically create extraction schemas for you.
+### 1. Memory-Adaptive Dispatcher System
+The new dispatcher system provides intelligent resource management and real-time monitoring:

-**How it Works:**
-
-1. **Provide HTML**: Feed in a sample HTML snippet that contains the type of data you want to extract (e.g., product listings, article sections).
-2. **Describe Your Needs (Optional)**: You can provide a natural language query like "extract all product names and prices" to guide the schema creation.
-3. **Choose Your LLM**: Use either **OpenAI** (GPT-4o recommended) for top-tier accuracy or **Ollama** for a local, open-source option.
-4. **Get Your Schema**: The tool outputs a ready-to-use JSON schema that works seamlessly with `JsonCssExtractionStrategy` or `JsonXPathExtractionStrategy`.
-
-**Why You'll Love It:**
-
-- **No More Tedious Selector Writing**: Let the LLM analyze the HTML and create the selectors for you!
-- **One-Time Cost**: Schema generation uses LLM, but once you have your schema, subsequent extractions are fast and LLM-free.
-- **Handles Complex Structures**: The LLM can understand nested elements, lists, and variations in layout—far beyond what simple CSS selectors can achieve.
-- **Learn by Example**: The generated schemas are a fantastic way to learn best practices for writing your own schemas.
-
-**Example:**
-
-```python
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
-
-# Sample HTML snippet (imagine this is part of a product listing page)
-html = """
-<div class=