Refactor content scraping strategies: comment out WebScrapingStrategy references and switch to LXMLWebScrapingStrategy across multiple files. Port the WebScrapingStrategy methods to LXMLWebScrapingStrategy.

This commit is contained in:
ntohidi
2025-05-27 17:32:45 +02:00
parent 3b766e1aac
commit bfec5156ad
7 changed files with 76 additions and 850 deletions

View File

@@ -12,10 +12,10 @@ parent_dir = os.path.dirname(
sys.path.append(parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
from crawl4ai.content_scraping_strategy import (
WebScrapingStrategy as WebScrapingStrategyCurrent,
)
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
# from crawl4ai.content_scraping_strategy import (
# WebScrapingStrategy as WebScrapingStrategyCurrent,
# )
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
@@ -32,8 +32,8 @@ class TestResult:
class StrategyTester:
def __init__(self):
self.new_scraper = WebScrapingStrategy()
self.current_scraper = WebScrapingStrategyCurrent()
self.new_scraper = LXMLWebScrapingStrategy()
self.current_scraper = LXMLWebScrapingStrategy()
with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
self.WIKI_HTML = f.read()
self.results = {"new": [], "current": []}

View File

@@ -2,7 +2,6 @@ import json
import time
from bs4 import BeautifulSoup
from crawl4ai.content_scraping_strategy import (
WebScrapingStrategy,
LXMLWebScrapingStrategy,
)
from typing import Dict, List, Tuple
@@ -274,7 +273,7 @@ def get_test_scenarios():
that will be passed into scrap() for testing various features.
"""
TEST_SCENARIOS = {
# "default": {},
"default": {},
# "exclude_domains": {
# "exclude_domains": {"images.example.com", "ads.example.com"}
# },
@@ -609,19 +608,26 @@ class ScraperEquivalenceTester:
print("\n=== Testing complicated HTML with multiple parameter scenarios ===")
# Create the scrapers once (or you can re-create if needed)
original = WebScrapingStrategy()
# original = WebScrapingStrategy()
original = LXMLWebScrapingStrategy()
lxml = LXMLWebScrapingStrategy()
# Base URL for testing
url = "http://test.com"
url = "https://kidocode.com"
for scenario_name, params in get_test_scenarios().items():
print(f"\nScenario: {scenario_name}")
start = time.time()
orig_result = original.scrap("http://test.com", complicated_html, **params)
orig_result = original.scrap(url, complicated_html, **params)
orig_time = time.time() - start
orig_result = orig_result.model_dump()
start = time.time()
lxml_result = lxml.scrap("http://test.com", complicated_html, **params)
lxml_result = lxml.scrap(url, complicated_html, **params)
lxml_time = time.time() - start
lxml_result = lxml_result.model_dump()
diffs = {}
link_diff = self.deep_compare_links(