feat(release): prepare v0.4.3 beta release

Prepare the v0.4.3 beta release with major feature additions and improvements: - Add JsonXPathExtractionStrategy and LLMContentFilter to exports - Update version to 0.4.3b1 - Improve documentation for dispatchers and markdown generation - Update development status to Beta - Reorganize changelog format BREAKING CHANGE: Memory threshold in MemoryAdaptiveDispatcher increased to 90% and SemaphoreDispatcher parameter renamed to max_session_permit
2025-01-21 21:03:11 +08:00
parent d09c611d15
commit 16b8d4945b
12 changed files with 885 additions and 287 deletions
--- a/docs/examples/dispatcher_example.py
+++ b/docs/examples/dispatcher_example.py
@@ -12,6 +12,7 @@ from crawl4ai import (
    CrawlerMonitor,
    DisplayMode,
    CacheMode,
+    LXMLWebScrapingStrategy,
 )


@@ -113,7 +114,7 @@ def create_performance_table(results):
 async def main():
    urls = [f"https://example.com/page{i}" for i in range(1, 20)]
    browser_config = BrowserConfig(headless=True, verbose=False)
-    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, scraping_strategy=LXMLWebScrapingStrategy())

    results = {
        "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
--- a/docs/examples/llm_markdown_generator.py
+++ b/docs/examples/llm_markdown_generator.py
@@ -0,0 +1,87 @@
+import os
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.content_filter_strategy import LLMContentFilter
+
+async def test_llm_filter(use_focused_instruction=False):
+    """Crawl the Python classes tutorial and clean its HTML with LLMContentFilter.
+
+    Prints a preview of the filtered content, writes it to ``filtered_content.md``
+    and shows token usage. The API key is read from ``OPENAI_API_KEY``.
+
+    Args:
+        use_focused_instruction: Use the condensing prompt instead of the default
+            one, which preserves the article's original wording.
+    """
+    url = "https://docs.python.org/3/tutorial/classes.html"
+    browser_config = BrowserConfig(headless=True, verbose=True)
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # First get the cleaned HTML the filter will work on
+        result = await crawler.arun(url, config=run_config)
+        if not result.success:
+            print(f"Crawl failed for {url}; nothing to filter")
+            return
+        html = result.cleaned_html
+
+        # Prompt that condenses the page to its core teaching content
+        focused_instruction = """
+            Focus on extracting the core educational content about Python classes.
+            Include:
+            - Key concepts and their explanations
+            - Important code examples
+            - Essential technical details
+            Exclude:
+            - Navigation elements
+            - Sidebars
+            - Footer content
+            - Version information
+            - Any non-essential UI elements
+            
+            Format the output as clean markdown with proper code blocks and headers.
+            """
+
+        # Default prompt: keep the article's wording and strip only the noise
+        verbatim_instruction = """
+            Extract the main educational content while preserving its original wording and substance completely. Your task is to:
+
+            1. Maintain the exact language and terminology used in the main content
+            2. Keep all technical explanations, examples, and educational content intact
+            3. Preserve the original flow and structure of the core content
+            4. Remove only clearly irrelevant elements like:
+            - Navigation menus
+            - Advertisement sections
+            - Cookie notices
+            - Footers with site information
+            - Sidebars with external links
+            - Any UI elements that don't contribute to learning
+
+            The goal is to create a clean markdown version that reads exactly like the original article, 
+            keeping all valuable content but free from distracting elements. Imagine you're creating 
+            a perfect reading experience where nothing valuable is lost, but all noise is removed.
+            """
+
+        content_filter = LLMContentFilter(
+            provider="openai/gpt-4o",
+            api_token=os.getenv('OPENAI_API_KEY'),
+            chunk_token_threshold=2 ** 12 * 2,  # 4096 * 2 = 8192
+            instruction=focused_instruction if use_focused_instruction else verbatim_instruction,
+            verbose=True,
+        )
+        filtered_content = content_filter.filter_content(html, ignore_cache=True)
+
+        print("\nFiltered Content Length:", len(filtered_content))
+        print("\nFirst 500 chars of filtered content:")
+        if filtered_content:
+            print(filtered_content[0][:500])
+
+        # Save the markdown version to disk
+        with open("filtered_content.md", "w", encoding="utf-8") as f:
+            f.write("\n".join(filtered_content))
+
+        # Show token usage
+        content_filter.show_usage()
+
+if __name__ == "__main__":
+    asyncio.run(test_llm_filter())
--- a/docs/examples/scraping_strategies_performance.py
+++ b/docs/examples/scraping_strategies_performance.py
@@ -0,0 +1,135 @@
+import time, re
+from crawl4ai.content_scraping_strategy import WebScrapingStrategy,  LXMLWebScrapingStrategy
+import time
+import functools
+from collections import defaultdict
+
+class TimingStats:
+    """Per-strategy, per-function call counts and elapsed seconds, with a printed report."""
+    def __init__(self):
+        self.stats = defaultdict(lambda: defaultdict(lambda: {"calls": 0, "total_time": 0}))
+
+    def add(self, strategy_name, func_name, elapsed):
+        entry = self.stats[strategy_name][func_name]
+        entry["calls"] += 1
+        entry["total_time"] += elapsed
+
+    def report(self):
+        rule = "-" * 60
+        for strategy_name, funcs in self.stats.items():
+            print(f"\n{strategy_name} Timing Breakdown:", rule, sep="\n")
+            print(f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}", rule, sep="\n")
+            for func, data in sorted(funcs.items(), key=lambda kv: kv[1]["total_time"], reverse=True):
+                avg_ms = (data["total_time"] / data["calls"]) * 1000
+                print(f"{func:<30} {data['calls']:<10} {data['total_time']:<10.3f} {avg_ms:<10.2f}")
+
+timing_stats = TimingStats()
+
+def timing_decorator(strategy_name):
+    """Return a decorator that records each call's duration under `strategy_name`."""
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # perf_counter is monotonic and high-resolution, unlike time.time()
+            start = time.perf_counter()
+            result = func(*args, **kwargs)
+            timing_stats.add(strategy_name, func.__name__, time.perf_counter() - start)
+            return result
+        return wrapper
+    return decorator
+
+# Instrument one method of a strategy class with the timing decorator
+def apply_decorators(cls, method_name, strategy_name):
+    """Replace cls.<method_name> with a timed wrapper; report it if it is missing."""
+    wrap = timing_decorator(strategy_name)
+    try:
+        setattr(cls, method_name, wrap(getattr(cls, method_name)))
+    except AttributeError:
+        print(f"Method {method_name} not found in class {cls.__name__}.")
+
+# Methods timed on each strategy class
+methods_to_profile = ['_scrap', '_process_element', 'process_image']
+
+# Strategy classes to instrument, paired with the label used in the report
+strategies_to_profile = (
+    (WebScrapingStrategy, "Original"),
+    (LXMLWebScrapingStrategy, "LXML"),
+)
+
+# Apply decorators to both strategies
+for strategy, name in strategies_to_profile:
+    for method in methods_to_profile:
+        apply_decorators(strategy, method, name)
+
+
+def generate_large_html(n_elements=1000):
+    """Build a synthetic HTML page containing ``n_elements`` article blocks."""
+    def article(i):
+        return f'''
+            <div class="article">
+                <h2>Heading {i}</h2>
+                <div>
+                    <div>
+                        <p>This is paragraph {i} with some content and a <a href="http://example.com/{i}">link</a></p>
+                    </div>
+                </div>
+                <img src="image{i}.jpg" alt="Image {i}">
+                <ul>
+                    <li>List item {i}.1</li>
+                    <li>List item {i}.2</li>
+                </ul>
+            </div>
+        '''
+    blocks = ''.join(article(i) for i in range(n_elements))
+    return '<!DOCTYPE html><html><head></head><body>' + blocks + '</body></html>'
+
+def test_scraping():
+    """Scrape one large synthetic page with both strategies and compare them.
+
+    The LXML strategy runs first, then the original one. Prints the per-method
+    timing breakdown and, per strategy, link/image counts, cleaned HTML size
+    and elapsed time.
+    """
+    original_scraper = WebScrapingStrategy()
+    selected_scraper = LXMLWebScrapingStrategy()
+
+    print("Generating HTML...")
+    html = generate_large_html(5000)
+    print(f"HTML Size: {len(html)/1024:.2f} KB")
+
+    kwargs = {
+        "url": "http://example.com",
+        "html": html,
+        "word_count_threshold": 5,
+        "keep_data_attributes": True,
+    }
+
+    # Take every timestamp from the same clock: time.time() and
+    # time.perf_counter() have unrelated origins and must not be subtracted.
+    print("\nStarting scrape...")
+    t1 = time.perf_counter()
+    result_selected = selected_scraper.scrap(**kwargs)
+    t2 = time.perf_counter()
+    result_original = original_scraper.scrap(**kwargs)
+    t3 = time.perf_counter()
+    print(f"\nScraping completed in {t3 - t1:.2f} seconds")
+
+    timing_stats.report()
+
+    # Charge each strategy only for its own interval: LXML ran in [t1, t2],
+    # the original strategy in [t2, t3].
+    runs = [
+        ("LXML", result_selected, t2 - t1),
+        ("Original", result_original, t3 - t2),
+    ]
+    for label, result, seconds in runs:
+        links = result['links']
+        print(f"\n{label} Output:")
+        print(f"\nExtracted links: {len(links['internal']) + len(links['external'])}")
+        print(f"Extracted images: {len(result['media']['images'])}")
+        print(f"Clean HTML size: {len(result['cleaned_html'])/1024:.2f} KB")
+        print(f"Scraping time: {seconds:.2f} seconds")
+
+
+if __name__ == "__main__":
+    test_scraping()
--- a/docs/examples/v0_4_3_features_demo.py
+++ b/docs/examples/v0_4_3_features_demo.py
@@ -0,0 +1,252 @@
+"""
+Crawl4ai v0.4.3 Features Demo
+============================
+
+This example demonstrates the major new features introduced in Crawl4ai v0.4.3.
+Each section showcases a specific feature with practical examples and explanations.
+"""
+
+import asyncio
+import os
+from crawl4ai import *
+
+
+async def demo_memory_dispatcher():
+    """
+    1. Memory Dispatcher System Demo
+    ================================
+    Crawls a small batch of URLs through MemoryAdaptiveDispatcher with a
+    CrawlerMonitor attached.
+    """
+    print("\n=== 1. Memory Dispatcher System Demo ===")
+
+    # Test URLs: three hosts, each repeated three times
+    targets = ["http://example.com", "http://example.org", "http://example.net"] * 3
+
+    run_settings = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(),
+    )
+
+    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=True)) as crawler:
+        # The monitor is handed to the dispatcher below
+        progress_monitor = CrawlerMonitor(
+            max_visible_rows=10,
+            display_mode=DisplayMode.DETAILED,  # Can be DETAILED or AGGREGATED
+        )
+
+        memory_dispatcher = MemoryAdaptiveDispatcher(
+            memory_threshold_percent=80.0,  # Memory usage threshold
+            check_interval=0.5,  # How often to check memory
+            max_session_permit=5,  # Max concurrent crawls
+            monitor=progress_monitor,
+        )
+
+        # Run with memory monitoring
+        print("Starting batch crawl with memory monitoring...")
+        completed = await memory_dispatcher.run_urls(
+            urls=targets,
+            crawler=crawler,
+            config=run_settings,
+        )
+        print(f"Completed {len(completed)} URLs")
+
+
+async def demo_streaming_support():
+    """
+    2. Streaming Support Demo
+    =========================
+    Consumes crawl results one at a time, as the dispatcher's streaming
+    API yields them.
+    """
+    print("\n=== 2. Streaming Support Demo ===")
+
+    browser_settings = BrowserConfig(headless=True, verbose=True)
+    stream_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)
+
+    # Test URLs
+    targets = ["http://example.com", "http://example.org", "http://example.net"] * 2
+
+    async with AsyncWebCrawler(config=browser_settings) as crawler:
+        stream_dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5)
+
+        print("Starting streaming crawl...")
+        result_stream = stream_dispatcher.run_urls_stream(
+            urls=targets, crawler=crawler, config=stream_settings
+        )
+        async for item in result_stream:
+            # Each item carries the URL plus the crawl result for it
+            outcome = item.result
+            print(f"Received result for {item.url} - Success: {outcome.success}")
+            if outcome.success:
+                print(f"Content length: {len(outcome.markdown)}")
+
+
+async def demo_content_scraping():
+    """
+    3. Content Scraping Strategy Demo
+    =================================
+    Crawls one page with LXMLWebScrapingStrategy selected as the scraping strategy.
+    """
+    print("\n=== 3. Content Scraping Strategy Demo ===")
+
+    target_url = "https://example.com/article"
+    lxml_crawler = AsyncWebCrawler()
+
+    # Select the LXML strategy through the run config
+    lxml_config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True)
+
+    print("Scraping content with LXML strategy...")
+    async with lxml_crawler:
+        outcome = await lxml_crawler.arun(target_url, config=lxml_config)
+        if outcome.success:
+            print("Successfully scraped content using LXML strategy")
+
+
+async def demo_llm_markdown():
+    """
+    4. LLM-Powered Markdown Generation Demo
+    =======================================
+    Crawls one page with an LLMContentFilter attached to the markdown generator and previews the result.
+    """
+    print("\n=== 4. LLM-Powered Markdown Generation Demo ===")
+
+    crawler = AsyncWebCrawler()
+    url = "https://docs.python.org/3/tutorial/classes.html"
+
+    content_filter = LLMContentFilter(
+        provider="openai/gpt-4o",
+        api_token=os.getenv("OPENAI_API_KEY"),
+        instruction="""
+        Focus on extracting the core educational content about Python classes.
+        Include:
+        - Key concepts and their explanations
+        - Important code examples
+        - Essential technical details
+        Exclude:
+        - Navigation elements
+        - Sidebars
+        - Footer content
+        - Version information
+        - Any non-essential UI elements
+        
+        Format the output as clean markdown with proper code blocks and headers.
+        """,
+        verbose=True,
+    )
+
+    # Configure LLM-powered markdown generation
+    config = CrawlerRunConfig(
+        markdown_generator=DefaultMarkdownGenerator(
+            content_filter=content_filter
+        ),
+        cache_mode=CacheMode.BYPASS,
+        verbose=True,
+    )
+
+    print("Generating focused markdown with LLM...")
+    async with crawler:
+        result = await crawler.arun(url, config=config)
+        if result.success and result.markdown_v2:
+            print("Successfully generated LLM-filtered markdown")
+            print("First 500 chars of filtered content:")
+            # NOTE(review): guards against an empty/None fit_markdown — confirm its type
+            print((result.markdown_v2.fit_markdown or "")[:500])
+
+
+async def demo_robots_compliance():
+    """
+    5. Robots.txt Compliance Demo
+    =============================
+    Crawls several sites with robots.txt checking enabled and reports each
+    result as blocked (status code 403) or successfully crawled.
+    """
+    print("\n=== 5. Robots.txt Compliance Demo ===")
+
+    targets = ["https://example.com", "https://facebook.com", "https://twitter.com"]
+    # Enable robots.txt checking
+    robots_config = CrawlerRunConfig(check_robots_txt=True, verbose=True)
+    polite_crawler = AsyncWebCrawler()
+
+    print("Crawling with robots.txt compliance...")
+    async with polite_crawler:
+        for outcome in await polite_crawler.arun_many(targets, config=robots_config):
+            if outcome.status_code == 403:
+                print(f"Access blocked by robots.txt: {outcome.url}")
+                continue
+            if outcome.success:
+                print(f"Successfully crawled: {outcome.url}")
+
+
+
+async def demo_llm_schema_generation():
+    """
+    7. LLM-Powered Schema Generation Demo
+    =====================================
+    Generates a CSS extraction schema from sample HTML with an LLM, then crawls with it.
+    """
+    import json  # local import: json is not among this file's explicit imports
+    print("\n=== 7. LLM-Powered Schema Generation Demo ===")
+
+    # Example HTML content for a job listing
+    html_content = """
+    <div class="job-listing">
+        <h1 class="job-title">Senior Software Engineer</h1>
+        <div class="job-details">
+            <span class="location">San Francisco, CA</span>
+            <span class="salary">$150,000 - $200,000</span>
+            <div class="requirements">
+                <h2>Requirements</h2>
+                <ul>
+                    <li>5+ years Python experience</li>
+                    <li>Strong background in web crawling</li>
+                </ul>
+            </div>
+        </div>
+    </div>
+    """
+
+    print("Generating CSS selectors schema...")
+    # Generate CSS selectors with a specific query
+    css_schema = JsonCssExtractionStrategy.generate_schema(
+        html_content,
+        schema_type="CSS",
+        query="Extract job title, location, and salary information",
+        provider="openai/gpt-4o",  # or use other providers like "ollama"
+    )
+    print("\nGenerated CSS Schema:")
+    print(css_schema)
+
+    # Example of using the generated schema with crawler
+    crawler = AsyncWebCrawler()
+    url = "https://example.com/job-listing"
+
+    # Create an extraction strategy with the generated schema
+    extraction_strategy = JsonCssExtractionStrategy(schema=css_schema)
+    config = CrawlerRunConfig(extraction_strategy=extraction_strategy, verbose=True)
+
+    print("\nTesting generated schema with crawler...")
+    async with crawler:
+        result = await crawler.arun(url, config=config)
+        if result.success:
+            print(json.dumps(result.extracted_content, indent=2) if result.extracted_content else None)
+            print("Successfully used generated schema for crawling")
+
+
+async def main():
+    """Run all feature demonstrations one after another, printing a separator after each."""
+    demos = (
+        demo_memory_dispatcher,
+        demo_streaming_support,
+        demo_content_scraping,
+        demo_llm_schema_generation,
+        demo_llm_markdown,
+        demo_robots_compliance,
+    )
+    for demo in demos:
+        # Calling a coroutine function only creates the coroutine; await runs it
+        await demo()
+        print("\n" + "=" * 50 + "\n")
+
+if __name__ == "__main__":
+    asyncio.run(main())