Apply Ruff Corrections

Author: UncleCode
Date:   2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
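
The diff below consists of formatting-level fixes of the kind Ruff produces: single quotes normalized to double quotes, long calls and f-strings wrapped to the line-length limit, unused imports dropped, trailing whitespace stripped, and trailing commas added when arguments are split across lines. A minimal sketch of how such corrections are usually applied, assuming the `ruff` CLI is installed and run from the repository root (the exact invocation and configuration behind this commit are not recorded here):

# Illustrative sketch only, not the commit's actual tooling: apply Ruff's
# lint autofixes and formatter across the repository via subprocess.
import subprocess

subprocess.run(["ruff", "check", "--fix", "."], check=True)  # lint autofixes (e.g. removing unused imports)
subprocess.run(["ruff", "format", "."], check=True)          # formatter (quotes, wrapping, trailing commas)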


@@ -1,11 +1,15 @@
import json
import time
from bs4 import BeautifulSoup
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
from typing import Dict, Any, List, Tuple
from crawl4ai.content_scraping_strategy import (
WebScrapingStrategy,
LXMLWebScrapingStrategy,
)
from typing import Dict, List, Tuple
import difflib
from lxml import html as lhtml, etree
def normalize_dom(element):
"""
Recursively normalizes an lxml HTML element:
@@ -15,7 +19,7 @@ def normalize_dom(element):
Returns the same element (mutated).
"""
# Remove comment nodes
comments = element.xpath('//comment()')
comments = element.xpath("//comment()")
for c in comments:
p = c.getparent()
if p is not None:
@@ -45,7 +49,7 @@ def strip_html_body(root):
"""
If 'root' is <html>, find its <body> child and move all of <body>'s children
into a new <div>. Return that <div>.
If 'root' is <body>, similarly move all of its children into a new <div> and return it.
Otherwise, return 'root' as-is.
@@ -53,8 +57,8 @@ def strip_html_body(root):
tag_name = (root.tag or "").lower()
# Case 1: The root is <html>
if tag_name == 'html':
bodies = root.xpath('./body')
if tag_name == "html":
bodies = root.xpath("./body")
if bodies:
body = bodies[0]
new_div = lhtml.Element("div")
@@ -66,7 +70,7 @@ def strip_html_body(root):
return root
# Case 2: The root is <body>
elif tag_name == 'body':
elif tag_name == "body":
new_div = lhtml.Element("div")
for child in root:
new_div.append(child)
@@ -92,7 +96,9 @@ def compare_nodes(node1, node2, differences, path="/"):
attrs1 = list(node1.attrib.items())
attrs2 = list(node2.attrib.items())
if attrs1 != attrs2:
differences.append(f"Attribute mismatch at {path}/{node1.tag}: {attrs1} vs. {attrs2}")
differences.append(
f"Attribute mismatch at {path}/{node1.tag}: {attrs1} vs. {attrs2}"
)
# 3) Compare text (trim or unify whitespace as needed)
text1 = (node1.text or "").strip()
@@ -102,7 +108,9 @@ def compare_nodes(node1, node2, differences, path="/"):
text2 = " ".join(text2.split())
if text1 != text2:
# If you prefer ignoring newlines or multiple whitespace, do a more robust cleanup
differences.append(f"Text mismatch at {path}/{node1.tag}: '{text1}' vs. '{text2}'")
differences.append(
f"Text mismatch at {path}/{node1.tag}: '{text1}' vs. '{text2}'"
)
# 4) Compare number of children
children1 = list(node1)
@@ -123,7 +131,9 @@ def compare_nodes(node1, node2, differences, path="/"):
tail1 = (node1.tail or "").strip()
tail2 = (node2.tail or "").strip()
if tail1 != tail2:
differences.append(f"Tail mismatch after {path}/{node1.tag}: '{tail1}' vs. '{tail2}'")
differences.append(
f"Tail mismatch after {path}/{node1.tag}: '{tail1}' vs. '{tail2}'"
)
def compare_html_structurally(html1, html2):
@@ -156,11 +166,11 @@ def compare_html_structurally(html1, html2):
return differences
def generate_large_html(n_elements=1000):
html = ['<!DOCTYPE html><html><head></head><body>']
html = ["<!DOCTYPE html><html><head></head><body>"]
for i in range(n_elements):
html.append(f'''
html.append(
f"""
<div class="article">
<h2>Heading {i}</h2>
<p>This is paragraph {i} with some content and a <a href="http://example.com/{i}">link</a></p>
@@ -170,13 +180,15 @@ def generate_large_html(n_elements=1000):
<li>List item {i}.2</li>
</ul>
</div>
''')
html.append('</body></html>')
return ''.join(html)
"""
)
html.append("</body></html>")
return "".join(html)
def generate_complicated_html():
"""
HTML with multiple domains, forms, data attributes,
various images, comments, style, and noscript to test all parameter toggles.
"""
return """
@@ -258,7 +270,7 @@ def generate_complicated_html():
def get_test_scenarios():
"""
Returns a dictionary of parameter sets (test scenarios) for the scraper.
Each scenario name maps to a dictionary of keyword arguments
that will be passed into scrap() for testing various features.
"""
TEST_SCENARIOS = {
@@ -341,7 +353,7 @@ def get_test_scenarios():
# "exclude_external_links": True
# },
# "comprehensive_removal": {
# # Exclude multiple tags, remove forms & comments,
# # and also remove targeted selectors
# "excluded_tags": ["aside", "noscript", "script"],
# "excluded_selector": "#promo-section, .social-widget",
@@ -352,19 +364,18 @@ def get_test_scenarios():
return TEST_SCENARIOS
class ScraperEquivalenceTester:
def __init__(self):
self.test_cases = {
'basic': self.generate_basic_html(),
'complex': self.generate_complex_html(),
'malformed': self.generate_malformed_html(),
"basic": self.generate_basic_html(),
"complex": self.generate_complex_html(),
"malformed": self.generate_malformed_html(),
# 'real_world': self.load_real_samples()
}
def generate_basic_html(self):
return generate_large_html(1000) # Your existing function
def generate_complex_html(self):
return """
<html><body>
@@ -384,7 +395,7 @@ class ScraperEquivalenceTester:
</div>
</body></html>
"""
def generate_malformed_html(self):
return """
<div>Unclosed div
@@ -395,139 +406,139 @@ class ScraperEquivalenceTester:
<!-- Malformed comment -- > -->
<![CDATA[Test CDATA]]>
"""
def load_real_samples(self):
# Load some real-world HTML samples you've collected
samples = {
'article': open('tests/samples/article.html').read(),
'product': open('tests/samples/product.html').read(),
'blog': open('tests/samples/blog.html').read()
"article": open("tests/samples/article.html").read(),
"product": open("tests/samples/product.html").read(),
"blog": open("tests/samples/blog.html").read(),
}
return samples
def deep_compare_links(self, old_links: Dict, new_links: Dict) -> List[str]:
"""Detailed comparison of link structures"""
differences = []
for category in ['internal', 'external']:
old_urls = {link['href'] for link in old_links[category]}
new_urls = {link['href'] for link in new_links[category]}
for category in ["internal", "external"]:
old_urls = {link["href"] for link in old_links[category]}
new_urls = {link["href"] for link in new_links[category]}
missing = old_urls - new_urls
extra = new_urls - old_urls
if missing:
differences.append(f"Missing {category} links: {missing}")
if extra:
differences.append(f"Extra {category} links: {extra}")
# Compare link attributes for common URLs
common = old_urls & new_urls
for url in common:
old_link = next(l for l in old_links[category] if l['href'] == url)
new_link = next(l for l in new_links[category] if l['href'] == url)
for attr in ['text', 'title']:
old_link = next(l for l in old_links[category] if l["href"] == url)
new_link = next(l for l in new_links[category] if l["href"] == url)
for attr in ["text", "title"]:
if old_link[attr] != new_link[attr]:
differences.append(
f"Link attribute mismatch for {url} - {attr}:"
f" old='{old_link[attr]}' vs new='{new_link[attr]}'"
)
return differences
def deep_compare_media(self, old_media: Dict, new_media: Dict) -> List[str]:
"""Detailed comparison of media elements"""
differences = []
for media_type in ['images', 'videos', 'audios']:
old_srcs = {item['src'] for item in old_media[media_type]}
new_srcs = {item['src'] for item in new_media[media_type]}
for media_type in ["images", "videos", "audios"]:
old_srcs = {item["src"] for item in old_media[media_type]}
new_srcs = {item["src"] for item in new_media[media_type]}
missing = old_srcs - new_srcs
extra = new_srcs - old_srcs
if missing:
differences.append(f"Missing {media_type}: {missing}")
if extra:
differences.append(f"Extra {media_type}: {extra}")
# Compare media attributes for common sources
common = old_srcs & new_srcs
for src in common:
old_item = next(m for m in old_media[media_type] if m['src'] == src)
new_item = next(m for m in new_media[media_type] if m['src'] == src)
for attr in ['alt', 'description']:
old_item = next(m for m in old_media[media_type] if m["src"] == src)
new_item = next(m for m in new_media[media_type] if m["src"] == src)
for attr in ["alt", "description"]:
if old_item.get(attr) != new_item.get(attr):
differences.append(
f"{media_type} attribute mismatch for {src} - {attr}:"
f" old='{old_item.get(attr)}' vs new='{new_item.get(attr)}'"
)
return differences
def compare_html_content(self, old_html: str, new_html: str) -> List[str]:
"""Compare HTML content structure and text"""
# return compare_html_structurally(old_html, new_html)
differences = []
def normalize_html(html: str) -> Tuple[str, str]:
soup = BeautifulSoup(html, 'lxml')
soup = BeautifulSoup(html, "lxml")
# Get both structure and text
structure = ' '.join(tag.name for tag in soup.find_all())
text = ' '.join(soup.get_text().split())
structure = " ".join(tag.name for tag in soup.find_all())
text = " ".join(soup.get_text().split())
return structure, text
old_structure, old_text = normalize_html(old_html)
new_structure, new_text = normalize_html(new_html)
# Compare structure
if abs(len(old_structure) - len(new_structure)) > 100:
# if old_structure != new_structure:
diff = difflib.unified_diff(
old_structure.split(),
new_structure.split(),
lineterm=''
old_structure.split(), new_structure.split(), lineterm=""
)
differences.append("HTML structure differences:\n" + '\n'.join(diff))
differences.append("HTML structure differences:\n" + "\n".join(diff))
# Compare text content
if abs(len(old_text) - len(new_text)) > 100:
# if old_text != new_text:
# Show detailed text differences
text_diff = difflib.unified_diff(
old_text.split(),
new_text.split(),
lineterm=''
old_text.split(), new_text.split(), lineterm=""
)
differences.append("Text content differences:\n" + '\n'.join(text_diff))
differences.append("Text content differences:\n" + "\n".join(text_diff))
return differences
def compare_results(self, old_result: Dict, new_result: Dict) -> Dict[str, List[str]]:
def compare_results(
self, old_result: Dict, new_result: Dict
) -> Dict[str, List[str]]:
"""Comprehensive comparison of scraper outputs"""
differences = {}
# Compare links
link_differences = self.deep_compare_links(old_result['links'], new_result['links'])
link_differences = self.deep_compare_links(
old_result["links"], new_result["links"]
)
if link_differences:
differences['links'] = link_differences
differences["links"] = link_differences
# Compare media
media_differences = self.deep_compare_media(old_result['media'], new_result['media'])
media_differences = self.deep_compare_media(
old_result["media"], new_result["media"]
)
if media_differences:
differences['media'] = media_differences
differences["media"] = media_differences
# Compare HTML
html_differences = self.compare_html_content(
old_result['cleaned_html'],
new_result['cleaned_html']
old_result["cleaned_html"], new_result["cleaned_html"]
)
if html_differences:
differences['html'] = html_differences
differences["html"] = html_differences
return differences
def run_tests(self) -> Dict:
@@ -535,52 +546,49 @@ class ScraperEquivalenceTester:
# We'll still keep some "test_cases" logic from above (basic, complex, malformed).
# But we add a new section for the complicated HTML scenarios.
results = {
'tests': [],
'summary': {'passed': 0, 'failed': 0}
}
results = {"tests": [], "summary": {"passed": 0, "failed": 0}}
# 1) First, run the existing 3 built-in test cases (basic, complex, malformed).
# for case_name, html in self.test_cases.items():
# print(f"\nTesting built-in case: {case_name}...")
# original = WebScrapingStrategy()
# lxml = LXMLWebScrapingStrategy()
# start = time.time()
# orig_result = original.scrap("http://test.com", html)
# orig_time = time.time() - start
# print("\nOriginal Mode:")
# print(f"Cleaned HTML size: {len(orig_result['cleaned_html'])/1024:.2f} KB")
# print(f"Images: {len(orig_result['media']['images'])}")
# print(f"External links: {len(orig_result['links']['external'])}")
# print(f"Times - Original: {orig_time:.3f}s")
# start = time.time()
# lxml_result = lxml.scrap("http://test.com", html)
# lxml_time = time.time() - start
# print("\nLXML Mode:")
# print(f"Cleaned HTML size: {len(lxml_result['cleaned_html'])/1024:.2f} KB")
# print(f"Images: {len(lxml_result['media']['images'])}")
# print(f"External links: {len(lxml_result['links']['external'])}")
# print(f"Times - LXML: {lxml_time:.3f}s")
# # Compare
# diffs = {}
# link_diff = self.deep_compare_links(orig_result['links'], lxml_result['links'])
# if link_diff:
# diffs['links'] = link_diff
# media_diff = self.deep_compare_media(orig_result['media'], lxml_result['media'])
# if media_diff:
# diffs['media'] = media_diff
# html_diff = self.compare_html_content(orig_result['cleaned_html'], lxml_result['cleaned_html'])
# if html_diff:
# diffs['html'] = html_diff
# test_result = {
# 'case': case_name,
# 'lxml_mode': {
@@ -590,7 +598,7 @@ class ScraperEquivalenceTester:
# 'original_time': orig_time
# }
# results['tests'].append(test_result)
# if not diffs:
# results['summary']['passed'] += 1
# else:
@@ -599,50 +607,55 @@ class ScraperEquivalenceTester:
# 2) Now, run the complicated HTML with multiple parameter scenarios.
complicated_html = generate_complicated_html()
print("\n=== Testing complicated HTML with multiple parameter scenarios ===")
# Create the scrapers once (or you can re-create if needed)
original = WebScrapingStrategy()
lxml = LXMLWebScrapingStrategy()
for scenario_name, params in get_test_scenarios().items():
print(f"\nScenario: {scenario_name}")
start = time.time()
orig_result = original.scrap("http://test.com", complicated_html, **params)
orig_time = time.time() - start
start = time.time()
lxml_result = lxml.scrap("http://test.com", complicated_html, **params)
lxml_time = time.time() - start
diffs = {}
link_diff = self.deep_compare_links(orig_result['links'], lxml_result['links'])
link_diff = self.deep_compare_links(
orig_result["links"], lxml_result["links"]
)
if link_diff:
diffs['links'] = link_diff
diffs["links"] = link_diff
media_diff = self.deep_compare_media(orig_result['media'], lxml_result['media'])
media_diff = self.deep_compare_media(
orig_result["media"], lxml_result["media"]
)
if media_diff:
diffs['media'] = media_diff
diffs["media"] = media_diff
html_diff = self.compare_html_content(orig_result['cleaned_html'], lxml_result['cleaned_html'])
html_diff = self.compare_html_content(
orig_result["cleaned_html"], lxml_result["cleaned_html"]
)
if html_diff:
diffs['html'] = html_diff
diffs["html"] = html_diff
test_result = {
'case': f"complicated_{scenario_name}",
'lxml_mode': {
'differences': diffs,
'execution_time': lxml_time
},
'original_time': orig_time
"case": f"complicated_{scenario_name}",
"lxml_mode": {"differences": diffs, "execution_time": lxml_time},
"original_time": orig_time,
}
results['tests'].append(test_result)
results["tests"].append(test_result)
if not diffs:
results['summary']['passed'] += 1
print(f"✅ [OK] No differences found. Time(Orig: {orig_time:.3f}s, LXML: {lxml_time:.3f}s)")
results["summary"]["passed"] += 1
print(
f"✅ [OK] No differences found. Time(Orig: {orig_time:.3f}s, LXML: {lxml_time:.3f}s)"
)
else:
results['summary']['failed'] += 1
results["summary"]["failed"] += 1
print("❌ Differences found:")
for category, dlist in diffs.items():
print(f" {category}:")
@@ -657,20 +670,22 @@ class ScraperEquivalenceTester:
print(f"Total Cases: {len(results['tests'])}")
print(f"Passed: {results['summary']['passed']}")
print(f"Failed: {results['summary']['failed']}")
for test in results['tests']:
for test in results["tests"]:
print(f"\nTest Case: {test['case']}")
if not test['lxml_mode']['differences']:
if not test["lxml_mode"]["differences"]:
print("✅ All implementations produced identical results")
print(f"Times - Original: {test['original_time']:.3f}s, "
f"LXML: {test['lxml_mode']['execution_time']:.3f}s")
print(
f"Times - Original: {test['original_time']:.3f}s, "
f"LXML: {test['lxml_mode']['execution_time']:.3f}s"
)
else:
print("❌ Differences found:")
if test['lxml_mode']['differences']:
if test["lxml_mode"]["differences"]:
print("\nLXML Mode Differences:")
for category, diffs in test['lxml_mode']['differences'].items():
for category, diffs in test["lxml_mode"]["differences"].items():
print(f"\n{category}:")
for diff in diffs:
print(f" - {diff}")
@@ -680,11 +695,11 @@ def main():
tester = ScraperEquivalenceTester()
results = tester.run_tests()
tester.print_report(results)
# Save detailed results for debugging
with open('scraper_equivalence_results.json', 'w') as f:
with open("scraper_equivalence_results.json", "w") as f:
json.dump(results, f, indent=2)
if __name__ == "__main__":
main()
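
Because the commit is intended to be formatting-only, one quick sanity check is to compare a module's AST before and after reformatting: quote style, line wrapping, and added trailing commas do not change the parsed tree, although edits that touch string literals themselves (such as stripping trailing whitespace inside a docstring) will show up as differences. A minimal sketch, with placeholder file names that are not part of this commit:

# Hypothetical spot-check that a reformatted module still parses to the same AST.
import ast

def same_ast(old_src: str, new_src: str) -> bool:
    # ast.dump() omits positions by default, so pure formatting changes compare equal.
    return ast.dump(ast.parse(old_src)) == ast.dump(ast.parse(new_src))

# Usage (placeholder paths):
# with open("before_ruff.py") as f_old, open("after_ruff.py") as f_new:
#     assert same_ast(f_old.read(), f_new.read())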