feat(release): prepare v0.4.3 beta release
Prepare the v0.4.3 beta release with major feature additions and improvements: - Add JsonXPathExtractionStrategy and LLMContentFilter to exports - Update version to 0.4.3b1 - Improve documentation for dispatchers and markdown generation - Update development status to Beta - Reorganize changelog format BREAKING CHANGE: Memory threshold in MemoryAdaptiveDispatcher increased to 90% and SemaphoreDispatcher parameter renamed to max_session_permit
This commit is contained in:
135
docs/examples/scraping_strategies_performance.py
Normal file
135
docs/examples/scraping_strategies_performance.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import time, re
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
import time
|
||||
import functools
|
||||
from collections import defaultdict
|
||||
|
||||
class TimingStats:
|
||||
def __init__(self):
|
||||
self.stats = defaultdict(lambda: defaultdict(lambda: {"calls": 0, "total_time": 0}))
|
||||
|
||||
def add(self, strategy_name, func_name, elapsed):
|
||||
self.stats[strategy_name][func_name]["calls"] += 1
|
||||
self.stats[strategy_name][func_name]["total_time"] += elapsed
|
||||
|
||||
def report(self):
|
||||
for strategy_name, funcs in self.stats.items():
|
||||
print(f"\n{strategy_name} Timing Breakdown:")
|
||||
print("-" * 60)
|
||||
print(f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}")
|
||||
print("-" * 60)
|
||||
|
||||
for func, data in sorted(funcs.items(), key=lambda x: x[1]["total_time"], reverse=True):
|
||||
avg_ms = (data["total_time"] / data["calls"]) * 1000
|
||||
print(f"{func:<30} {data['calls']:<10} {data['total_time']:<10.3f} {avg_ms:<10.2f}")
|
||||
|
||||
timing_stats = TimingStats()
|
||||
|
||||
# Modify timing decorator
|
||||
def timing_decorator(strategy_name):
|
||||
def decorator(func):
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
start = time.time()
|
||||
result = func(*args, **kwargs)
|
||||
elapsed = time.time() - start
|
||||
timing_stats.add(strategy_name, func.__name__, elapsed)
|
||||
return result
|
||||
return wrapper
|
||||
return decorator
|
||||
|
||||
# Modified decorator application
|
||||
def apply_decorators(cls, method_name, strategy_name):
|
||||
try:
|
||||
original_method = getattr(cls, method_name)
|
||||
decorated_method = timing_decorator(strategy_name)(original_method)
|
||||
setattr(cls, method_name, decorated_method)
|
||||
except AttributeError:
|
||||
print(f"Method {method_name} not found in class {cls.__name__}.")
|
||||
|
||||
# Apply to key methods
|
||||
methods_to_profile = [
|
||||
'_scrap',
|
||||
# 'process_element',
|
||||
'_process_element',
|
||||
'process_image',
|
||||
]
|
||||
|
||||
|
||||
# Apply decorators to both strategies
|
||||
for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
|
||||
for method in methods_to_profile:
|
||||
apply_decorators(strategy, method, name)
|
||||
|
||||
|
||||
def generate_large_html(n_elements=1000):
|
||||
html = ['<!DOCTYPE html><html><head></head><body>']
|
||||
for i in range(n_elements):
|
||||
html.append(f'''
|
||||
<div class="article">
|
||||
<h2>Heading {i}</h2>
|
||||
<div>
|
||||
<div>
|
||||
<p>This is paragraph {i} with some content and a <a href="http://example.com/{i}">link</a></p>
|
||||
</div>
|
||||
</div>
|
||||
<img src="image{i}.jpg" alt="Image {i}">
|
||||
<ul>
|
||||
<li>List item {i}.1</li>
|
||||
<li>List item {i}.2</li>
|
||||
</ul>
|
||||
</div>
|
||||
''')
|
||||
html.append('</body></html>')
|
||||
return ''.join(html)
|
||||
|
||||
def test_scraping():
|
||||
# Initialize both scrapers
|
||||
original_scraper = WebScrapingStrategy()
|
||||
selected_scraper = LXMLWebScrapingStrategy()
|
||||
|
||||
# Generate test HTML
|
||||
print("Generating HTML...")
|
||||
html = generate_large_html(5000)
|
||||
print(f"HTML Size: {len(html)/1024:.2f} KB")
|
||||
|
||||
# Time the scraping
|
||||
print("\nStarting scrape...")
|
||||
start_time = time.time()
|
||||
|
||||
kwargs = {
|
||||
"url": "http://example.com",
|
||||
"html": html,
|
||||
"word_count_threshold": 5,
|
||||
"keep_data_attributes": True
|
||||
}
|
||||
|
||||
t1 = time.perf_counter()
|
||||
result_selected = selected_scraper.scrap(**kwargs)
|
||||
t2 = time.perf_counter()
|
||||
|
||||
result_original = original_scraper.scrap(**kwargs)
|
||||
t3 = time.perf_counter()
|
||||
|
||||
elapsed = t3 - start_time
|
||||
print(f"\nScraping completed in {elapsed:.2f} seconds")
|
||||
|
||||
timing_stats.report()
|
||||
|
||||
# Print stats of LXML output
|
||||
print("\nLXML Output:")
|
||||
print(f"\nExtracted links: {len(result_selected['links']['internal']) + len(result_selected['links']['external'])}")
|
||||
print(f"Extracted images: {len(result_selected['media']['images'])}")
|
||||
print(f"Clean HTML size: {len(result_selected['cleaned_html'])/1024:.2f} KB")
|
||||
print(f"Scraping time: {t2 - t1:.2f} seconds")
|
||||
|
||||
# Print stats of original output
|
||||
print("\nOriginal Output:")
|
||||
print(f"\nExtracted links: {len(result_original['links']['internal']) + len(result_original['links']['external'])}")
|
||||
print(f"Extracted images: {len(result_original['media']['images'])}")
|
||||
print(f"Clean HTML size: {len(result_original['cleaned_html'])/1024:.2f} KB")
|
||||
print(f"Scraping time: {t3 - t1:.2f} seconds")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_scraping()
|
||||
Reference in New Issue
Block a user