feat(scraping): add LXML-based scraping mode for improved performance

Adds a new ScrapingMode enum to allow switching between BeautifulSoup and LXML parsing. LXML mode offers 10-20x better performance for large HTML documents.

Key changes:
- Added ScrapingMode enum with BEAUTIFULSOUP and LXML options
- Implemented LXMLWebScrapingStrategy class
- Added LXML-based metadata extraction
- Updated documentation with scraping mode usage and performance considerations
- Added cssselect dependency

BREAKING CHANGE: None
2025-01-12 20:46:23 +08:00
parent 825c78a048
commit f3ae5a657c
12 changed files with 1366 additions and 509 deletions
--- a/scraper_evaluation.json
+++ b/scraper_evaluation.json
@@ -0,0 +1,52 @@
+{
+  "original": {
+    "performance": [],
+    "differences": []
+  },
+  "batch": {
+    "performance": [
+      {
+        "case": "basic",
+        "metrics": {
+          "time": 0.8874530792236328,
+          "memory": 98.328125
+        }
+      }
+    ],
+    "differences": [
+      {
+        "case": "basic",
+        "differences": {
+          "images_count": {
+            "old": 50,
+            "new": 0,
+            "diff": -50
+          }
+        }
+      }
+    ]
+  },
+  "lxml": {
+    "performance": [
+      {
+        "case": "basic",
+        "metrics": {
+          "time": 1.210719108581543,
+          "memory": 99.921875
+        }
+      }
+    ],
+    "differences": [
+      {
+        "case": "basic",
+        "differences": {
+          "images_count": {
+            "old": 50,
+            "new": 0,
+            "diff": -50
+          }
+        }
+      }
+    ]
+  }
+}