commit 2def6524cdacb69c72760bf55a41089257c0bb07
Author: ntohidi <nasrin@kidocode.com>
Date:   Mon Aug 4 18:59:10 2025 +0800

    refactor: consolidate WebScrapingStrategy to use LXML implementation only

    BREAKING CHANGE: None - full backward compatibility maintained

    This commit simplifies the content scraping architecture by removing the
    redundant BeautifulSoup-based WebScrapingStrategy implementation and making
    it an alias for LXMLWebScrapingStrategy.

    Changes:
    - Remove ~1000 lines of BeautifulSoup-based WebScrapingStrategy code
    - Make WebScrapingStrategy an alias for LXMLWebScrapingStrategy
    - Update LXMLWebScrapingStrategy to inherit directly from ContentScrapingStrategy
    - Add required methods (scrap, ascrap, process_element, _log) to LXMLWebScrapingStrategy
    - Maintain 100% backward compatibility - existing code continues to work

    Code changes:
    - crawl4ai/content_scraping_strategy.py: Remove WebScrapingStrategy class, add alias
    - crawl4ai/async_configs.py: Remove WebScrapingStrategy from imports
    - crawl4ai/__init__.py: Update imports to show alias relationship
    - crawl4ai/types.py: Update type definitions
    - crawl4ai/legacy/web_crawler.py: Update import to use alias
    - tests/async/test_content_scraper_strategy.py: Update to use LXMLWebScrapingStrategy
    - docs/examples/scraping_strategies_performance.py: Update to use single strategy

    Documentation updates:
    - docs/md_v2/core/content-selection.md: Update scraping modes section
    - docs/md_v2/migration/webscraping-strategy-migration.md: Add migration guide
    - CHANGELOG.md: Document the refactoring under [Unreleased]

    Benefits:
    - 10-20x faster HTML parsing for large documents
    - Reduced memory usage and simplified codebase
    - Consistent parsing behavior
    - No migration required for existing users

    All existing code using WebScrapingStrategy continues to work without
    modification, while benefiting from LXML's superior performance.
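
In practice the consolidation described above boils down to a module-level alias, so the old name keeps resolving to the LXML implementation. A minimal sketch of the shape of the change (only the class and method names come from the commit message; the signatures and the stub base class are illustrative assumptions, not the actual crawl4ai API):

    class ContentScrapingStrategy:
        """Stands in for crawl4ai's abstract base class."""

    class LXMLWebScrapingStrategy(ContentScrapingStrategy):
        # Per the commit, the LXML strategy now inherits directly from
        # ContentScrapingStrategy and carries the entry points that callers
        # of the old class relied on. Signatures here are assumptions.
        def scrap(self, url: str, html: str, **kwargs): ...
        async def ascrap(self, url: str, html: str, **kwargs): ...
        def process_element(self, element, **kwargs): ...
        def _log(self, level, message, **kwargs): ...

    # Backward compatibility: the old name is a plain alias, so existing
    # WebScrapingStrategy() call sites pick up the LXML implementation.
    WebScrapingStrategy = LXMLWebScrapingStrategy
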
crawl4ai/legacy/web_crawler.py · 295 lines · 10 KiB · Python
import os, time

os.environ["TOKENIZERS_PARALLELISM"] = "false"

from pathlib import Path

from .models import UrlModel, CrawlResult
from .database import init_db, get_cached_url, cache_url
from .utils import *
from .chunking_strategy import *
from .extraction_strategy import *
from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor

# WebScrapingStrategy is now an alias for LXMLWebScrapingStrategy; this legacy
# module imports it under the old name so the rest of the file is unchanged.
from ..content_scraping_strategy import LXMLWebScrapingStrategy as WebScrapingStrategy
from .config import *
import warnings
import json

warnings.filterwarnings(
    "ignore",
    message='Field "model_name" has conflict with protected namespace "model_".',
)


class WebCrawler:
    def __init__(
        self,
        crawler_strategy: CrawlerStrategy = None,
        always_by_pass_cache: bool = False,
        verbose: bool = False,
    ):
        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(
            verbose=verbose
        )
        self.always_by_pass_cache = always_by_pass_cache
        self.crawl4ai_folder = os.path.join(
            os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
        )
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
        init_db()
        self.ready = False

    def warmup(self):
        print("[LOG] 🌤️ Warming up the WebCrawler")
        self.run(
            url="https://google.com/",
            word_count_threshold=5,
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=False,
            verbose=False,
        )
        self.ready = True
        print("[LOG] 🌞 WebCrawler is ready to crawl")

    def fetch_page(
        self,
        url_model: UrlModel,
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        css_selector: str = None,
        screenshot: bool = False,
        use_cached_html: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    ) -> CrawlResult:
        return self.run(
            url_model.url,
            word_count_threshold,
            extraction_strategy or NoExtractionStrategy(),
            chunking_strategy,
            bypass_cache=url_model.forced,
            css_selector=css_selector,
            screenshot=screenshot,
            **kwargs,
        )

    def fetch_pages(
        self,
        url_models: List[UrlModel],
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    ) -> List[CrawlResult]:
        extraction_strategy = extraction_strategy or NoExtractionStrategy()

        def fetch_page_wrapper(url_model, *args):
            # The last positional argument carries the shared kwargs dict;
            # unpack it so fetch_page receives proper keyword arguments.
            *positional, kw = args
            return self.fetch_page(url_model, *positional, **kw)

        # executor.map zips one value per URL from each iterable below; kwargs
        # is passed as a single iterable of dicts so every call receives the
        # same keyword arguments.
        with ThreadPoolExecutor() as executor:
            results = list(
                executor.map(
                    fetch_page_wrapper,
                    url_models,
                    [provider] * len(url_models),
                    [api_token] * len(url_models),
                    [extract_blocks_flag] * len(url_models),
                    [word_count_threshold] * len(url_models),
                    [css_selector] * len(url_models),
                    [screenshot] * len(url_models),
                    [use_cached_html] * len(url_models),
                    [extraction_strategy] * len(url_models),
                    [chunking_strategy] * len(url_models),
                    [kwargs] * len(url_models),
                )
            )

        return results

    def run(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        try:
            extraction_strategy = extraction_strategy or NoExtractionStrategy()
            extraction_strategy.verbose = verbose
            if not isinstance(extraction_strategy, ExtractionStrategy):
                raise ValueError("Unsupported extraction strategy")
            if not isinstance(chunking_strategy, ChunkingStrategy):
                raise ValueError("Unsupported chunking strategy")

            word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)

            cached = None
            html = None
            screenshot_data = None
            extracted_content = None
            if not bypass_cache and not self.always_by_pass_cache:
                cached = get_cached_url(url)

            # Bail out early when the crawler has not been warmed up; callers
            # can skip this check by passing warmup=False.
            if kwargs.get("warmup", True) and not self.ready:
                return None

            if cached:
                html = sanitize_input_encode(cached[1])
                extracted_content = sanitize_input_encode(cached[4])
                if screenshot:
                    screenshot_data = cached[9]
                    if not screenshot_data:
                        # Cached entry has no screenshot: re-crawl to capture one.
                        cached = None

            if not cached or not html:
                if user_agent:
                    self.crawler_strategy.update_user_agent(user_agent)
                t1 = time.time()
                html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
                t2 = time.time()
                if verbose:
                    print(
                        f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                    )
                if screenshot:
                    screenshot_data = self.crawler_strategy.take_screenshot()

            crawl_result = self.process_html(
                url,
                html,
                extracted_content,
                word_count_threshold,
                extraction_strategy,
                chunking_strategy,
                css_selector,
                screenshot_data,
                verbose,
                bool(cached),
                **kwargs,
            )
            crawl_result.success = bool(html)
            return crawl_result
        except Exception as e:
            if not hasattr(e, "msg"):
                e.msg = str(e)
            print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
            return CrawlResult(url=url, html="", success=False, error_message=e.msg)

    def process_html(
        self,
        url: str,
        html: str,
        extracted_content: str,
        word_count_threshold: int,
        extraction_strategy: ExtractionStrategy,
        chunking_strategy: ChunkingStrategy,
        css_selector: str,
        screenshot: bool,
        verbose: bool,
        is_cached: bool,
        **kwargs,
    ) -> CrawlResult:
        t = time.time()
        # Extract content from HTML
        try:
            t1 = time.time()
            scraping_strategy = WebScrapingStrategy()
            extra_params = {
                k: v
                for k, v in kwargs.items()
                if k not in ["only_text", "image_description_min_word_threshold"]
            }
            result = scraping_strategy.scrap(
                url,
                html,
                word_count_threshold=word_count_threshold,
                css_selector=css_selector,
                only_text=kwargs.get("only_text", False),
                image_description_min_word_threshold=kwargs.get(
                    "image_description_min_word_threshold",
                    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
                ),
                **extra_params,
            )

            # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
            if verbose:
                print(
                    f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
                )

            if result is None:
                raise ValueError(f"Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))

        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
        markdown = sanitize_input_encode(result.get("markdown", ""))
        media = result.get("media", [])
        links = result.get("links", [])
        metadata = result.get("metadata", {})

        if extracted_content is None:
            if verbose:
                print(
                    f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}"
                )

            sections = chunking_strategy.chunk(markdown)
            extracted_content = extraction_strategy.run(url, sections)
            extracted_content = json.dumps(
                extracted_content, indent=4, default=str, ensure_ascii=False
            )

        if verbose:
            print(
                f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds."
            )

        # Normalize falsy screenshot data ("" or False) to None.
        screenshot = None if not screenshot else screenshot

        if not is_cached:
            cache_url(
                url,
                html,
                cleaned_html,
                markdown,
                extracted_content,
                True,
                json.dumps(media),
                json.dumps(links),
                json.dumps(metadata),
                screenshot=screenshot,
            )

        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=format_html(cleaned_html),
            markdown=markdown,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=screenshot,
            extracted_content=extracted_content,
            success=True,
            error_message="",
        )
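

# --- Usage sketch (not part of the original module) -------------------------
# Assumes a working Selenium setup for the default LocalSeleniumCrawlerStrategy
# and network access; the URL and threshold below are illustrative only.
if __name__ == "__main__":
    crawler = WebCrawler(verbose=True)
    crawler.warmup()  # run() returns None until the crawler is warmed up
    result = crawler.run(url="https://example.com", word_count_threshold=5)
    if result and result.success:
        print(result.markdown[:500])  # first 500 chars of the extracted markdown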