perf(crawler): major performance improvements & raw HTML support

- Switch to lxml parser (~4x speedup)
- Add raw HTML & local file crawling support
- Fix cache headers & async cleanup
- Add browser process monitoring
- Optimize BeautifulSoup operations
- Pre-compile regex patterns

BREAKING CHANGE: raw HTML handling now requires new URL prefixes
Fixes: #256, #253
This commit is contained in:
UncleCode
2024-11-13 19:40:40 +08:00
parent 61b93ebf36
commit c38ac29edb
11 changed files with 2953 additions and 130 deletions

View File

@@ -10,7 +10,7 @@ from .extraction_strategy import *
from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor
-from .content_scrapping_strategy import WebScrappingStrategy
+from .content_scrapping_strategy import WebScrapingStrategy
from .config import *
import warnings
import json
@@ -182,7 +182,7 @@ class WebCrawler:
# Extract content from HTML
try:
t1 = time.time()
-scrapping_strategy = WebScrappingStrategy()
+scrapping_strategy = WebScrapingStrategy()
extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
result = scrapping_strategy.scrap(
url,