feat(scraping): add LXML-based scraping mode for improved performance

Adds a new ScrapingMode enum to allow switching between BeautifulSoup and LXML parsing.
LXML mode offers 10-20x better performance for large HTML documents.

Key changes:
- Added ScrapingMode enum with BEAUTIFULSOUP and LXML options
- Implemented LXMLWebScrapingStrategy class
- Added LXML-based metadata extraction
- Updated documentation with scraping mode usage and performance considerations
- Added cssselect dependency

No breaking changes.
This commit is contained in:
UncleCode
2025-01-12 20:46:23 +08:00
parent 825c78a048
commit f3ae5a657c
12 changed files with 1366 additions and 509 deletions

View File

@@ -12,6 +12,7 @@ from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy
from .markdown_generation_strategy import MarkdownGenerationStrategy
from typing import Union, List
from enum import Enum
class BrowserConfig:
"""
@@ -183,6 +184,12 @@ class BrowserConfig:
)
class ScrapingMode(str, Enum):
    """Enum for the available scraping modes.

    Mixing in ``str`` makes every member a real string, so a member
    compares equal to its plain value (``ScrapingMode.LXML == "lxml"``)
    and can be looked up from that value (``ScrapingMode("lxml")``).
    """

    # BeautifulSoup-based parsing.
    BEAUTIFULSOUP = "beautifulsoup"
    # LXML-based parsing.
    LXML = "lxml"
class CrawlerRunConfig:
"""
Configuration class for controlling how the crawler runs each crawl operation.
@@ -220,6 +227,8 @@ class CrawlerRunConfig:
Default: False.
parser_type (str): Type of parser to use for HTML parsing.
Default: "lxml".
scraping_mode (ScrapingMode): Scraping mode to use.
Default: ScrapingMode.BEAUTIFULSOUP.
# Caching Parameters
cache_mode (CacheMode or None): Defines how caching is handled.
@@ -331,6 +340,7 @@ class CrawlerRunConfig:
remove_forms: bool = False,
prettiify: bool = False,
parser_type: str = "lxml",
scraping_mode: ScrapingMode = ScrapingMode.BEAUTIFULSOUP,
# SSL Parameters
fetch_ssl_certificate: bool = False,
@@ -403,6 +413,7 @@ class CrawlerRunConfig:
self.remove_forms = remove_forms
self.prettiify = prettiify
self.parser_type = parser_type
self.scraping_mode = scraping_mode
# SSL Parameters
self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -489,6 +500,7 @@ class CrawlerRunConfig:
remove_forms=kwargs.get("remove_forms", False),
prettiify=kwargs.get("prettiify", False),
parser_type=kwargs.get("parser_type", "lxml"),
scraping_mode=kwargs.get("scraping_mode", ScrapingMode.BEAUTIFULSOUP),
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
@@ -562,6 +574,7 @@ class CrawlerRunConfig:
"remove_forms": self.remove_forms,
"prettiify": self.prettiify,
"parser_type": self.parser_type,
"scraping_mode": self.scraping_mode,
"fetch_ssl_certificate": self.fetch_ssl_certificate,
"cache_mode": self.cache_mode,
"session_id": self.session_id,