feat(release): prepare v0.4.3 beta release

Prepare the v0.4.3 beta release with major feature additions and improvements: - Add JsonXPathExtractionStrategy and LLMContentFilter to exports - Update version to 0.4.3b1 - Improve documentation for dispatchers and markdown generation - Update development status to Beta - Reorganize changelog format BREAKING CHANGE: Memory threshold in MemoryAdaptiveDispatcher increased to 90% and SemaphoreDispatcher parameter renamed to max_session_permit
2025-01-21 21:03:11 +08:00
parent d09c611d15
commit 16b8d4945b
12 changed files with 885 additions and 287 deletions
--- a/docs/examples/scraping_strategies_performance.py
+++ b/docs/examples/scraping_strategies_performance.py
@@ -0,0 +1,135 @@
+import time, re
+from crawl4ai.content_scraping_strategy import WebScrapingStrategy,  LXMLWebScrapingStrategy
+import time
+import functools
+from collections import defaultdict
+
+class TimingStats:
+    def __init__(self):
+        self.stats = defaultdict(lambda: defaultdict(lambda: {"calls": 0, "total_time": 0}))
+        
+    def add(self, strategy_name, func_name, elapsed):
+        self.stats[strategy_name][func_name]["calls"] += 1
+        self.stats[strategy_name][func_name]["total_time"] += elapsed
+        
+    def report(self):
+        for strategy_name, funcs in self.stats.items():
+            print(f"\n{strategy_name} Timing Breakdown:")
+            print("-" * 60)
+            print(f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}")
+            print("-" * 60)
+            
+            for func, data in sorted(funcs.items(), key=lambda x: x[1]["total_time"], reverse=True):
+                avg_ms = (data["total_time"] / data["calls"]) * 1000
+                print(f"{func:<30} {data['calls']:<10} {data['total_time']:<10.3f} {avg_ms:<10.2f}")
+
+timing_stats = TimingStats()
+
+# Modify timing decorator
+def timing_decorator(strategy_name):
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            start = time.time()
+            result = func(*args, **kwargs)
+            elapsed = time.time() - start
+            timing_stats.add(strategy_name, func.__name__, elapsed)
+            return result
+        return wrapper
+    return decorator
+
+# Modified decorator application
+def apply_decorators(cls, method_name, strategy_name):
+    try:
+        original_method = getattr(cls, method_name)
+        decorated_method = timing_decorator(strategy_name)(original_method)
+        setattr(cls, method_name, decorated_method)
+    except AttributeError:
+        print(f"Method {method_name} not found in class {cls.__name__}.")
+
+# Apply to key methods
+methods_to_profile = [
+    '_scrap',
+    # 'process_element', 
+    '_process_element', 
+    'process_image',
+]
+
+
+# Apply decorators to both strategies
+for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
+    for method in methods_to_profile:
+        apply_decorators(strategy, method, name)
+
+
+def generate_large_html(n_elements=1000):
+    html = ['<!DOCTYPE html><html><head></head><body>']
+    for i in range(n_elements):
+        html.append(f'''
+            <div class="article">
+                <h2>Heading {i}</h2>
+                <div>
+                    <div>
+                        <p>This is paragraph {i} with some content and a <a href="http://example.com/{i}">link</a></p>
+                    </div>
+                </div>
+                <img src="image{i}.jpg" alt="Image {i}">
+                <ul>
+                    <li>List item {i}.1</li>
+                    <li>List item {i}.2</li>
+                </ul>
+            </div>
+        ''')
+    html.append('</body></html>')
+    return ''.join(html)
+
+def test_scraping():
+    # Initialize both scrapers
+    original_scraper = WebScrapingStrategy()
+    selected_scraper = LXMLWebScrapingStrategy()
+    
+    # Generate test HTML
+    print("Generating HTML...")
+    html = generate_large_html(5000)
+    print(f"HTML Size: {len(html)/1024:.2f} KB")
+    
+    # Time the scraping
+    print("\nStarting scrape...")
+    start_time = time.time()
+    
+    kwargs = {
+        "url": "http://example.com",
+        "html": html,
+        "word_count_threshold": 5,
+        "keep_data_attributes": True
+    }
+    
+    t1 = time.perf_counter()
+    result_selected = selected_scraper.scrap(**kwargs)
+    t2 = time.perf_counter()
+    
+    result_original = original_scraper.scrap(**kwargs)
+    t3 = time.perf_counter()
+    
+    elapsed = t3 - start_time
+    print(f"\nScraping completed in {elapsed:.2f} seconds")
+    
+    timing_stats.report()
+    
+    # Print stats of LXML output
+    print("\nLXML Output:")
+    print(f"\nExtracted links: {len(result_selected['links']['internal']) + len(result_selected['links']['external'])}")
+    print(f"Extracted images: {len(result_selected['media']['images'])}")
+    print(f"Clean HTML size: {len(result_selected['cleaned_html'])/1024:.2f} KB")
+    print(f"Scraping time: {t2 - t1:.2f} seconds")
+
+    # Print stats of original output
+    print("\nOriginal Output:")
+    print(f"\nExtracted links: {len(result_original['links']['internal']) + len(result_original['links']['external'])}")
+    print(f"Extracted images: {len(result_original['media']['images'])}")
+    print(f"Clean HTML size: {len(result_original['cleaned_html'])/1024:.2f} KB")
+    print(f"Scraping time: {t3 - t1:.2f} seconds")
+        
+        
+if __name__ == "__main__":
+    test_scraping()