Refactor content scraping strategies: comment out WebScrapingStrategy references and switch them to LXMLWebScrapingStrategy across multiple files. Port the WebScrapingStrategy methods over to LXMLWebScrapingStrategy.

2025-05-27 17:32:45 +02:00
parent 3b766e1aac
commit bfec5156ad
7 changed files with 76 additions and 850 deletions
--- a/tests/async/test_content_scraper_strategy.py
+++ b/tests/async/test_content_scraper_strategy.py
@@ -12,10 +12,10 @@ parent_dir = os.path.dirname(
 sys.path.append(parent_dir)
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

-from crawl4ai.content_scraping_strategy import WebScrapingStrategy
-from crawl4ai.content_scraping_strategy import (
-    WebScrapingStrategy as WebScrapingStrategyCurrent,
-)
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+# from crawl4ai.content_scraping_strategy import (
+#     WebScrapingStrategy as WebScrapingStrategyCurrent,
+# )
 # from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent


@@ -32,8 +32,8 @@ class TestResult:

 class StrategyTester:
    def __init__(self):
-        self.new_scraper = WebScrapingStrategy()
-        self.current_scraper = WebScrapingStrategyCurrent()
+        self.new_scraper = LXMLWebScrapingStrategy()
+        self.current_scraper = LXMLWebScrapingStrategy()
        with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
            self.WIKI_HTML = f.read()
        self.results = {"new": [], "current": []}
--- a/tests/async/test_evaluation_scraping_methods_performance.configs.py
+++ b/tests/async/test_evaluation_scraping_methods_performance.configs.py
@@ -2,7 +2,6 @@ import json
 import time
 from bs4 import BeautifulSoup
 from crawl4ai.content_scraping_strategy import (
-    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
 )
 from typing import Dict, List, Tuple
@@ -274,7 +273,7 @@ def get_test_scenarios():
    that will be passed into scrap() for testing various features.
    """
    TEST_SCENARIOS = {
-        # "default": {},
+        "default": {},
        # "exclude_domains": {
        #     "exclude_domains": {"images.example.com", "ads.example.com"}
        # },
@@ -609,19 +608,26 @@ class ScraperEquivalenceTester:
        print("\n=== Testing complicated HTML with multiple parameter scenarios ===")

        # Create the scrapers once (or you can re-create if needed)
-        original = WebScrapingStrategy()
+        # original = WebScrapingStrategy()
+        original = LXMLWebScrapingStrategy()
        lxml = LXMLWebScrapingStrategy()
+        
+        # Base URL for testing
+        url = "http://test.com"
+        url = "https://kidocode.com"

        for scenario_name, params in get_test_scenarios().items():
            print(f"\nScenario: {scenario_name}")

            start = time.time()
-            orig_result = original.scrap("http://test.com", complicated_html, **params)
+            orig_result = original.scrap(url, complicated_html, **params)
            orig_time = time.time() - start
+            orig_result = orig_result.model_dump()

            start = time.time()
-            lxml_result = lxml.scrap("http://test.com", complicated_html, **params)
+            lxml_result = lxml.scrap(url, complicated_html, **params)
            lxml_time = time.time() - start
+            lxml_result = lxml_result.model_dump()

            diffs = {}
            link_diff = self.deep_compare_links(