feat(scraping): add LXML-based scraping mode for improved performance
Adds a new ScrapingMode enum to allow switching between BeautifulSoup and LXML parsing. LXML mode offers 10-20x better performance for large HTML documents. Key changes: - Added ScrapingMode enum with BEAUTIFULSOUP and LXML options - Implemented LXMLWebScrapingStrategy class - Added LXML-based metadata extraction - Updated documentation with scraping mode usage and performance considerations - Added cssselect dependency BREAKING CHANGE: None
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
# __init__.py
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode
|
||||
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
@@ -14,6 +14,7 @@ __all__ = [
|
||||
"AsyncWebCrawler",
|
||||
"CrawlResult",
|
||||
"CacheMode",
|
||||
"ScrapingMode",
|
||||
'BrowserConfig',
|
||||
'CrawlerRunConfig',
|
||||
'ExtractionStrategy',
|
||||
|
||||
Reference in New Issue
Block a user