commit 2def6524cdacb69c72760bf55a41089257c0bb07 Author: ntohidi <nasrin@kidocode.com> Date: Mon Aug 4 18:59:10 2025 +0800 refactor: consolidate WebScrapingStrategy to use LXML implementation only BREAKING CHANGE: None - full backward compatibility maintained This commit simplifies the content scraping architecture by removing the redundant BeautifulSoup-based WebScrapingStrategy implementation and making it an alias for LXMLWebScrapingStrategy. Changes: - Remove ~1000 lines of BeautifulSoup-based WebScrapingStrategy code - Make WebScrapingStrategy an alias for LXMLWebScrapingStrategy - Update LXMLWebScrapingStrategy to inherit directly from ContentScrapingStrategy - Add required methods (scrap, ascrap, process_element, _log) to LXMLWebScrapingStrategy - Maintain 100% backward compatibility - existing code continues to work Code changes: - crawl4ai/content_scraping_strategy.py: Remove WebScrapingStrategy class, add alias - crawl4ai/async_configs.py: Remove WebScrapingStrategy from imports - crawl4ai/__init__.py: Update imports to show alias relationship - crawl4ai/types.py: Update type definitions - crawl4ai/legacy/web_crawler.py: Update import to use alias - tests/async/test_content_scraper_strategy.py: Update to use LXMLWebScrapingStrategy - docs/examples/scraping_strategies_performance.py: Update to use single strategy Documentation updates: - docs/md_v2/core/content-selection.md: Update scraping modes section - docs/md_v2/migration/webscraping-strategy-migration.md: Add migration guide - CHANGELOG.md: Document the refactoring under [Unreleased] Benefits: - 10-20x faster HTML parsing for large documents - Reduced memory usage and simplified codebase - Consistent parsing behavior - No migration required for existing users All existing code using WebScrapingStrategy continues to work without modification, while benefiting from LXML's superior performance.
136 lines
4.6 KiB
Python
136 lines
4.6 KiB
Python
import time, re
|
|
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
|
# WebScrapingStrategy is now an alias for LXMLWebScrapingStrategy
|
|
import time
|
|
import functools
|
|
from collections import defaultdict
|
|
|
|
class TimingStats:
    """Accumulate call counts and cumulative run time per (strategy, function) pair."""

    def __init__(self):
        # Two-level auto-creating mapping:
        #   strategy name -> function name -> {"calls", "total_time"} counters.
        def _new_counters():
            return {"calls": 0, "total_time": 0}

        def _new_function_table():
            return defaultdict(_new_counters)

        self.stats = defaultdict(_new_function_table)

    def add(self, strategy_name, func_name, elapsed):
        """Record one call of ``func_name`` under ``strategy_name``.

        ``elapsed`` is added to the running total; ``report`` treats that
        total as seconds.
        """
        counters = self.stats[strategy_name][func_name]
        counters["calls"] += 1
        counters["total_time"] += elapsed

    def report(self):
        """Print one table per strategy, ordered by total time, largest first."""
        rule = "-" * 60
        header = f"{'Function':<30} {'Calls':<10} {'Total(s)':<10} {'Avg(ms)':<10}"

        for strategy_name, funcs in self.stats.items():
            print(f"\n{strategy_name} Timing Breakdown:")
            print(rule)
            print(header)
            print(rule)

            ranked = sorted(
                funcs.items(),
                key=lambda item: item[1]["total_time"],
                reverse=True,
            )
            for func_name, counters in ranked:
                calls = counters["calls"]
                total = counters["total_time"]
                # Average per call, converted from seconds to milliseconds.
                avg_ms = (total / calls) * 1000
                print(f"{func_name:<30} {calls:<10} {total:<10.3f} {avg_ms:<10.2f}")
|
|
|
|
# Module-wide collector: the timing wrappers write into it and
# test_scraping() prints its report.
timing_stats = TimingStats()
|
|
|
|
# Timing decorator: records each call's duration in timing_stats
|
|
def timing_decorator(strategy_name):
    """Build a decorator that times each call and records it in ``timing_stats``.

    Args:
        strategy_name: Label under which the measurements are grouped.

    Returns:
        A decorator. The wrapped callable keeps the original's metadata
        (via ``functools.wraps``) and returns whatever the original returns.
        A call that raises propagates the exception and is not recorded.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic and high-resolution, so the measured
            # interval cannot be distorted by wall-clock adjustments the way
            # a time.time() difference can.
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed = time.perf_counter() - start
            timing_stats.add(strategy_name, func.__name__, elapsed)
            return result
        return wrapper
    return decorator
|
|
|
|
# Replace a class method with its timed version
|
|
def apply_decorators(cls, method_name, strategy_name):
    """Swap ``cls.<method_name>`` for a timed version of itself.

    The attribute is looked up on ``cls``, wrapped with
    ``timing_decorator(strategy_name)`` and assigned back. An AttributeError
    raised at any of those steps is reported with a printed notice instead of
    propagating.
    """
    try:
        target = getattr(cls, method_name)
        wrap = timing_decorator(strategy_name)
        setattr(cls, method_name, wrap(target))
    except AttributeError:
        print(f"Method {method_name} not found in class {cls.__name__}.")
|
|
|
|
# Names of the strategy methods that get wrapped with the timing decorator.
methods_to_profile = [
    "_scrap",
    # "process_element",
    "_process_element",
    "process_image",
]

# Only the LXML strategy is profiled; each entry pairs a class with the label
# used for it in the timing report.
strategies_to_profile = [(LXMLWebScrapingStrategy, "LXML")]
for strategy, name in strategies_to_profile:
    for method in methods_to_profile:
        apply_decorators(strategy, method, name)
|
|
|
|
|
|
def generate_large_html(n_elements=1000):
    """Return a synthetic HTML page containing ``n_elements`` article sections.

    Each section holds a heading, a paragraph with one link nested inside two
    divs, an image, and a two-item list, all numbered with the section index.
    """
    def render_article(idx):
        # One article section; every placeholder is filled with the section index.
        return f'''
            <div class="article">
                <h2>Heading {idx}</h2>
                <div>
                    <div>
                        <p>This is paragraph {idx} with some content and a <a href="http://example.com/{idx}">link</a></p>
                    </div>
                </div>
                <img src="image{idx}.jpg" alt="Image {idx}">
                <ul>
                    <li>List item {idx}.1</li>
                    <li>List item {idx}.2</li>
                </ul>
            </div>
        '''

    sections = ''.join(render_article(idx) for idx in range(n_elements))
    return '<!DOCTYPE html><html><head></head><body>' + sections + '</body></html>'
|
|
|
|
def _print_scrape_summary(title, result, seconds):
    """Print link, image and cleaned-HTML statistics for one scrape result.

    Args:
        title: Heading line printed before the statistics.
        result: Scrape result exposing ``links.internal``, ``links.external``,
            ``media.images`` and ``cleaned_html``.
        seconds: Duration of the scrape that produced ``result``.
    """
    print(title)
    link_total = len(result.links.internal) + len(result.links.external)
    print(f"\nExtracted links: {link_total}")
    print(f"Extracted images: {len(result.media.images)}")
    print(f"Clean HTML size: {len(result.cleaned_html)/1024:.2f} KB")
    print(f"Scraping time: {seconds:.2f} seconds")


def test_scraping():
    """Scrape a large generated page with two scraper instances and print statistics.

    Prints the generated HTML size, the combined duration of both scrapes,
    the per-method timing report from ``timing_stats``, and a summary of each
    scrape's output together with its own duration.
    """
    # Two independent instances; both are LXMLWebScrapingStrategy.
    original_scraper = LXMLWebScrapingStrategy()
    selected_scraper = LXMLWebScrapingStrategy()

    # Generate test HTML
    print("Generating HTML...")
    html = generate_large_html(5000)
    print(f"HTML Size: {len(html)/1024:.2f} KB")

    print("\nStarting scrape...")

    kwargs = {
        "url": "http://example.com",
        "html": html,
        "word_count_threshold": 5,
        "keep_data_attributes": True,
    }

    # Every mark comes from perf_counter: its readings are only meaningful
    # relative to each other and must not be mixed with time.time() values.
    t1 = time.perf_counter()
    result_selected = selected_scraper.scrap(**kwargs)
    t2 = time.perf_counter()

    result_original = original_scraper.scrap(**kwargs)
    t3 = time.perf_counter()

    # Combined duration of both scrapes.
    elapsed = t3 - t1
    print(f"\nScraping completed in {elapsed:.2f} seconds")

    timing_stats.report()

    # Each summary reports only the interval of its own scrape.
    _print_scrape_summary("\nTurbo Output:", result_selected, t2 - t1)
    _print_scrape_summary("\nOriginal Output:", result_original, t3 - t2)
|
|
|
|
|
|
# Run the scraping benchmark when this file is executed as a script.
if __name__ == "__main__":
    test_scraping()