Adds a new ScrapingMode enum to allow switching between BeautifulSoup and LXML parsing. LXML mode offers 10-20x better performance for large HTML documents. Key changes: - Added ScrapingMode enum with BEAUTIFULSOUP and LXML options - Implemented LXMLWebScrapingStrategy class - Added LXML-based metadata extraction - Updated documentation with scraping mode usage and performance considerations - Added cssselect dependency BREAKING CHANGE: None
52 lines
862 B
JSON
52 lines
862 B
JSON
{
|
|
"original": {
|
|
"performance": [],
|
|
"differences": []
|
|
},
|
|
"batch": {
|
|
"performance": [
|
|
{
|
|
"case": "basic",
|
|
"metrics": {
|
|
"time": 0.8874530792236328,
|
|
"memory": 98.328125
|
|
}
|
|
}
|
|
],
|
|
"differences": [
|
|
{
|
|
"case": "basic",
|
|
"differences": {
|
|
"images_count": {
|
|
"old": 50,
|
|
"new": 0,
|
|
"diff": -50
|
|
}
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"lxml": {
|
|
"performance": [
|
|
{
|
|
"case": "basic",
|
|
"metrics": {
|
|
"time": 1.210719108581543,
|
|
"memory": 99.921875
|
|
}
|
|
}
|
|
],
|
|
"differences": [
|
|
{
|
|
"case": "basic",
|
|
"differences": {
|
|
"images_count": {
|
|
"old": 50,
|
|
"new": 0,
|
|
"diff": -50
|
|
}
|
|
}
|
|
}
|
|
]
|
|
}
|
|
} |