Apply Ruff Corrections
This commit is contained in:
@@ -1,22 +1,24 @@
|
||||
import asyncio
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Dict, Any
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import csv
|
||||
from tabulate import tabulate
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict
|
||||
from typing import List
|
||||
|
||||
# Directory three levels above this file, appended to sys.path.
# NOTE(review): presumably this lets the `crawl4ai` imports below resolve
# when the script is run from a source checkout - confirm.
parent_dir = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
sys.path.append(parent_dir)

# Resolved directory containing this file; used to locate the sample HTML
# input and to place the CSV output next to the script.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
from crawl4ai.content_scraping_strategy import (
|
||||
WebScrapingStrategy as WebScrapingStrategyCurrent,
|
||||
)
|
||||
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
|
||||
|
||||
@dataclass
|
||||
class TestResult:
|
||||
name: str
|
||||
@@ -27,69 +29,71 @@ class TestResult:
|
||||
markdown_length: int
|
||||
execution_time: float
|
||||
|
||||
|
||||
class StrategyTester:
|
||||
def __init__(self):
    """Create both scraper strategies and load the sample page they are fed.

    Reads ``sample_wikipedia.html`` from the directory stored in the
    module-level ``__location__`` and prepares empty result buckets.

    Raises:
        OSError: If the sample HTML file cannot be opened.
    """
    self.new_scraper = WebScrapingStrategy()
    self.current_scraper = WebScrapingStrategyCurrent()

    # Build the path with os.path.join rather than string concatenation,
    # the same way save_results_to_csv builds its output path.
    sample_path = os.path.join(__location__, "sample_wikipedia.html")
    with open(sample_path, "r", encoding="utf-8") as f:
        self.WIKI_HTML = f.read()

    self.results = {"new": [], "current": []}
|
||||
|
||||
def run_test(self, name: str, **kwargs) -> tuple[TestResult, TestResult]:
    """Run one scenario through both scrapers and collect comparable metrics.

    Args:
        name: Label stored on both returned results.
        **kwargs: Options forwarded unchanged to each scraper's
            ``_get_content_of_website_optimized`` call.

    Returns:
        A ``(new, current)`` pair of results, in that order.
    """
    results = []
    for scraper in (self.new_scraper, self.current_scraper):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        result = scraper._get_content_of_website_optimized(
            url="https://en.wikipedia.org/wiki/Test", html=self.WIKI_HTML, **kwargs
        )
        execution_time = time.perf_counter() - start_time

        results.append(
            TestResult(
                name=name,
                success=result["success"],
                images=len(result["media"]["images"]),
                internal_links=len(result["links"]["internal"]),
                external_links=len(result["links"]["external"]),
                markdown_length=len(result["markdown"]),
                execution_time=execution_time,
            )
        )

    return results[0], results[1]  # new, current
|
||||
|
||||
def run_all_tests(self):
|
||||
test_cases = [
|
||||
("Basic Extraction", {}),
|
||||
("Exclude Tags", {'excluded_tags': ['table', 'div.infobox', 'div.navbox']}),
|
||||
("Word Threshold", {'word_count_threshold': 50}),
|
||||
("CSS Selector", {'css_selector': 'div.mw-parser-output > p'}),
|
||||
("Link Exclusions", {
|
||||
'exclude_external_links': True,
|
||||
'exclude_social_media_links': True,
|
||||
'exclude_domains': ['facebook.com', 'twitter.com']
|
||||
}),
|
||||
("Media Handling", {
|
||||
'exclude_external_images': True,
|
||||
'image_description_min_word_threshold': 20
|
||||
}),
|
||||
("Text Only", {
|
||||
'only_text': True,
|
||||
'remove_forms': True
|
||||
}),
|
||||
("HTML Cleaning", {
|
||||
'clean_html': True,
|
||||
'keep_data_attributes': True
|
||||
}),
|
||||
("HTML2Text Options", {
|
||||
'html2text': {
|
||||
'skip_internal_links': True,
|
||||
'single_line_break': True,
|
||||
'mark_code': True,
|
||||
'preserve_tags': ['pre', 'code']
|
||||
}
|
||||
})
|
||||
("Exclude Tags", {"excluded_tags": ["table", "div.infobox", "div.navbox"]}),
|
||||
("Word Threshold", {"word_count_threshold": 50}),
|
||||
("CSS Selector", {"css_selector": "div.mw-parser-output > p"}),
|
||||
(
|
||||
"Link Exclusions",
|
||||
{
|
||||
"exclude_external_links": True,
|
||||
"exclude_social_media_links": True,
|
||||
"exclude_domains": ["facebook.com", "twitter.com"],
|
||||
},
|
||||
),
|
||||
(
|
||||
"Media Handling",
|
||||
{
|
||||
"exclude_external_images": True,
|
||||
"image_description_min_word_threshold": 20,
|
||||
},
|
||||
),
|
||||
("Text Only", {"only_text": True, "remove_forms": True}),
|
||||
("HTML Cleaning", {"clean_html": True, "keep_data_attributes": True}),
|
||||
(
|
||||
"HTML2Text Options",
|
||||
{
|
||||
"html2text": {
|
||||
"skip_internal_links": True,
|
||||
"single_line_break": True,
|
||||
"mark_code": True,
|
||||
"preserve_tags": ["pre", "code"],
|
||||
}
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
all_results = []
|
||||
@@ -99,64 +103,117 @@ class StrategyTester:
|
||||
all_results.append((name, new_result, current_result))
|
||||
except Exception as e:
|
||||
print(f"Error in {name}: {str(e)}")
|
||||
|
||||
|
||||
self.save_results_to_csv(all_results)
|
||||
self.print_comparison_table(all_results)
|
||||
|
||||
def save_results_to_csv(self, all_results: List[tuple]):
    """Write every comparison result to ``strategy_comparison_results.csv``.

    The file is created (or overwritten) in the module-level
    ``__location__`` directory. Each test contributes two rows: the "New"
    strategy first, then the "Current" one.

    Args:
        all_results: ``(name, new_result, current_result)`` tuples.

    Raises:
        OSError: If the CSV file cannot be opened for writing.
    """
    csv_file = os.path.join(__location__, "strategy_comparison_results.csv")
    # Explicit UTF-8 keeps the output independent of the platform's locale
    # encoding; newline="" is what the csv module expects for its files.
    with open(csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(
            [
                "Test Name",
                "Strategy",
                "Success",
                "Images",
                "Internal Links",
                "External Links",
                "Markdown Length",
                "Execution Time",
            ]
        )

        for name, new_result, current_result in all_results:
            # Both strategies share one row layout; only the label differs.
            for strategy, result in (("New", new_result), ("Current", current_result)):
                writer.writerow(
                    [
                        name,
                        strategy,
                        result.success,
                        result.images,
                        result.internal_links,
                        result.external_links,
                        result.markdown_length,
                        f"{result.execution_time:.3f}",
                    ]
                )
|
||||
|
||||
def print_comparison_table(self, all_results: List[tuple]):
    """Print a grid table comparing the new and current strategy per test.

    For each test the table holds a "New" row, a "Current" row, an optional
    row naming the metrics whose values differ, and a blank spacer row.

    Args:
        all_results: ``(name, new_result, current_result)`` tuples.
    """
    headers = [
        "Test Name",
        "Strategy",
        "Success",
        "Images",
        "Internal Links",
        "External Links",
        "Markdown Length",
        "Time (s)",
    ]
    # (attribute compared, label shown in the differences row)
    compared_fields = (
        ("images", "images"),
        ("internal_links", "internal_links"),
        ("external_links", "external_links"),
        ("markdown_length", "markdown"),
    )

    table_data = []
    for name, new_result, current_result in all_results:
        differences = [
            label
            for attr, label in compared_fields
            if getattr(new_result, attr) != getattr(current_result, attr)
        ]

        # The test name is shown only on the first ("New") row of each pair.
        for first_cell, strategy, outcome in (
            (name, "New", new_result),
            ("", "Current", current_result),
        ):
            table_data.append(
                [
                    first_cell,
                    strategy,
                    outcome.success,
                    outcome.images,
                    outcome.internal_links,
                    outcome.external_links,
                    outcome.markdown_length,
                    f"{outcome.execution_time:.3f}",
                ]
            )

        # Add difference summary if any
        if differences:
            table_data.append(
                ["", "⚠️ Differences", ", ".join(differences), "", "", "", "", ""]
            )

        # Blank spacer row for readability between tests
        table_data.append([""] * len(headers))

    print("\nStrategy Comparison Results:")
    print(tabulate(table_data, headers=headers, tablefmt="grid"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: constructing the tester loads the sample HTML,
    # then run_all_tests executes every case, writes the CSV and prints
    # the comparison table.
    tester = StrategyTester()
    tester.run_all_tests()
|
||||
|
||||
Reference in New Issue
Block a user