refactor(scraping): replace ScrapingMode enum with strategy pattern

Replace the ScrapingMode enum with a proper strategy pattern implementation for content scraping.
This change introduces:
- New ContentScrapingStrategy abstract base class
- Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations
- New Pydantic models for structured scraping results
- Updated documentation reflecting the new strategy-based approach

BREAKING CHANGE: ScrapingMode enum has been removed. Users should now use ContentScrapingStrategy implementations instead.
This commit is contained in:
UncleCode
2025-01-13 17:53:12 +08:00
parent f3ae5a657c
commit c3370ec5da
7 changed files with 185 additions and 70 deletions

View File

@@ -1,7 +1,8 @@
# __init__.py
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode
from .async_configs import BrowserConfig, CrawlerRunConfig
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -14,7 +15,9 @@ __all__ = [
"AsyncWebCrawler",
"CrawlResult",
"CacheMode",
"ScrapingMode",
"ContentScrapingStrategy",
"WebScrapingStrategy",
"LXMLWebScrapingStrategy",
'BrowserConfig',
'CrawlerRunConfig',
'ExtractionStrategy',

View File

@@ -9,10 +9,10 @@ from .config import (
)
from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from typing import Union, List
from enum import Enum
class BrowserConfig:
"""
@@ -184,12 +184,6 @@ class BrowserConfig:
)
class ScrapingMode(str, Enum):
"""Enum for different scraping modes."""
BEAUTIFULSOUP = "beautifulsoup"
LXML = "lxml"
class CrawlerRunConfig:
"""
Configuration class for controlling how the crawler runs each crawl operation.
@@ -227,8 +221,8 @@ class CrawlerRunConfig:
Default: False.
parser_type (str): Type of parser to use for HTML parsing.
Default: "lxml".
scraping_mode (ScrapingMode): Scraping mode to use.
Default: ScrapingMode.BEAUTIFULSOUP.
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
Default: WebScrapingStrategy.
# Caching Parameters
cache_mode (CacheMode or None): Defines how caching is handled.
@@ -329,7 +323,7 @@ class CrawlerRunConfig:
# Content Processing Parameters
word_count_threshold: int = MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = None,
content_filter=None,
only_text: bool = False,
@@ -340,7 +334,7 @@ class CrawlerRunConfig:
remove_forms: bool = False,
prettiify: bool = False,
parser_type: str = "lxml",
scraping_mode: ScrapingMode = ScrapingMode.BEAUTIFULSOUP,
scraping_strategy: ContentScrapingStrategy = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,
@@ -413,7 +407,7 @@ class CrawlerRunConfig:
self.remove_forms = remove_forms
self.prettiify = prettiify
self.parser_type = parser_type
self.scraping_mode = scraping_mode
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
# SSL Parameters
self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -480,7 +474,6 @@ class CrawlerRunConfig:
# Set default chunking strategy if None
if self.chunking_strategy is None:
from .chunking_strategy import RegexChunking
self.chunking_strategy = RegexChunking()
@staticmethod
@@ -489,7 +482,7 @@ class CrawlerRunConfig:
# Content Processing Parameters
word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy"),
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"),
only_text=kwargs.get("only_text", False),
@@ -500,7 +493,7 @@ class CrawlerRunConfig:
remove_forms=kwargs.get("remove_forms", False),
prettiify=kwargs.get("prettiify", False),
parser_type=kwargs.get("parser_type", "lxml"),
scraping_mode=kwargs.get("scraping_mode", ScrapingMode.BEAUTIFULSOUP),
scraping_strategy=kwargs.get("scraping_strategy"),
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
@@ -574,7 +567,7 @@ class CrawlerRunConfig:
"remove_forms": self.remove_forms,
"prettiify": self.prettiify,
"parser_type": self.parser_type,
"scraping_mode": self.scraping_mode,
"scraping_strategy": self.scraping_strategy,
"fetch_ssl_certificate": self.fetch_ssl_certificate,
"cache_mode": self.cache_mode,
"session_id": self.session_id,

View File

@@ -543,27 +543,20 @@ class AsyncWebCrawler:
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
t1 = time.perf_counter()
# Initialize scraping strategy based on mode
if config.scraping_mode == ScrapingMode.LXML:
scrapping_strategy = LXMLWebScrapingStrategy(logger=self.logger)
else: # Default to BeautifulSoup
scrapping_strategy = WebScrapingStrategy(logger=self.logger)
# Get scraping strategy and ensure it has a logger
scraping_strategy = config.scraping_strategy
if not scraping_strategy.logger:
scraping_strategy.logger = self.logger
# Process HTML content
params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
# add keys from kwargs to params that doesn't exist in params
params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
result = scrapping_strategy.scrap(
result = scraping_strategy.scrap(
url,
html,
**params,
# word_count_threshold=config.word_count_threshold,
# css_selector=config.css_selector,
# only_text=config.only_text,
# image_description_min_word_threshold=config.image_description_min_word_threshold,
# content_filter=config.content_filter,
# **kwargs
**params
)
if result is None:
@@ -576,13 +569,17 @@ class AsyncWebCrawler:
# Extract results
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
fit_html = sanitize_input_encode(result.get("fit_html", ""))
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})
# Extract results - handle both dict and ScrapingResult
if isinstance(result, dict):
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
media = result.get("media", {})
links = result.get("links", {})
metadata = result.get("metadata", {})
else:
cleaned_html = sanitize_input_encode(result.cleaned_html)
media = result.media.model_dump()
links = result.links.model_dump()
metadata = result.metadata
# Markdown Generation
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
@@ -610,10 +607,7 @@ class AsyncWebCrawler:
)
# Handle content extraction if needed
if (extracted_content is None and
config.extraction_strategy and
config.chunking_strategy and
not isinstance(config.extraction_strategy, NoExtractionStrategy)):
if (not bool(extracted_content) and config.extraction_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy)):
t1 = time.perf_counter()
@@ -664,8 +658,8 @@ class AsyncWebCrawler:
cleaned_html=cleaned_html,
markdown_v2=markdown_v2,
markdown=markdown,
fit_markdown=fit_markdown,
fit_html=fit_html,
fit_markdown=markdown_result.fit_markdown,
fit_html=markdown_result.fit_html,
media=media,
links=links,
metadata=metadata,

View File

@@ -21,6 +21,7 @@ from .utils import (
from lxml import etree
from lxml import html as lhtml
from typing import Dict, Any, List, Tuple
from .models import ScrapingResult, MediaItem, Link, Media, Links
# Pre-compile regular expressions for Open Graph and Twitter metadata
OG_REGEX = re.compile(r'^og:')
@@ -73,11 +74,11 @@ def fetch_image_file_size(img, base_url):
class ContentScrapingStrategy(ABC):
@abstractmethod
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
pass
@abstractmethod
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
pass
class WebScrapingStrategy(ContentScrapingStrategy):
@@ -101,7 +102,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
log_method = getattr(self.logger, level)
log_method(message=message, tag=tag, **kwargs)
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
"""
Main entry point for content scraping.
@@ -111,16 +112,40 @@ class WebScrapingStrategy(ContentScrapingStrategy):
**kwargs: Additional keyword arguments.
Returns:
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
ScrapingResult: A structured result containing the scraped content.
"""
return self._scrap(url, html, is_async=False, **kwargs)
raw_result = self._scrap(url, html, is_async=False, **kwargs)
if raw_result is None:
return ScrapingResult(
cleaned_html="",
success=False,
media=Media(),
links=Links(),
metadata={}
)
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
# Convert media items
media = Media(
images=[MediaItem(**img) for img in raw_result.get("media", {}).get("images", []) if img],
videos=[MediaItem(**vid) for vid in raw_result.get("media", {}).get("videos", []) if vid],
audios=[MediaItem(**aud) for aud in raw_result.get("media", {}).get("audios", []) if aud]
)
# Convert links
links = Links(
internal=[Link(**link) for link in raw_result.get("links", {}).get("internal", []) if link],
external=[Link(**link) for link in raw_result.get("links", {}).get("external", []) if link]
)
return ScrapingResult(
cleaned_html=raw_result.get("cleaned_html", ""),
success=raw_result.get("success", False),
media=media,
links=links,
metadata=raw_result.get("metadata", {})
)
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
"""
Main entry point for asynchronous content scraping.
@@ -130,12 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
**kwargs: Additional keyword arguments.
Returns:
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
ScrapingResult: A structured result containing the scraped content.
"""
return await asyncio.to_thread(self._scrap, url, html, **kwargs)

View File

@@ -1,5 +1,5 @@
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple, Any
from enum import Enum
from dataclasses import dataclass, field
from .ssl_certificate import SSLCertificate
@@ -129,3 +129,38 @@ class AsyncCrawlResponse(BaseModel):
class Config:
arbitrary_types_allowed = True
###############################
# Scraping Models
###############################
class MediaItem(BaseModel):
    """A single media element (image, video, or audio) discovered during scraping."""
    src: str  # source URL of the media element
    alt: Optional[str] = None  # alt text, if present in the markup
    desc: Optional[str] = None  # surrounding/descriptive text captured near the element
    score: int  # relevance score assigned by the scraping strategy — presumably higher is more relevant; confirm against strategy code
    type: str = "image"  # media kind; defaults to "image"
    group_id: int  # identifier grouping related media variants together
    format: Optional[str] = None  # file format (e.g. "jpg"), when it can be determined
    width: Optional[int] = None  # pixel width, when it can be determined
class Link(BaseModel):
    """A hyperlink extracted from a scraped page."""
    href: str  # link target URL
    text: str  # anchor text of the link
    title: Optional[str] = None  # title attribute, if present
    base_domain: str  # domain associated with the link — presumably used to classify internal vs. external; confirm against strategy code
class Media(BaseModel):
    """Container for all media elements found on a page, grouped by kind."""
    images: List[MediaItem] = []
    videos: List[MediaItem] = []  # Using MediaItem model for now, can be extended with Video model if needed
    audios: List[MediaItem] = []  # Using MediaItem model for now, can be extended with Audio model if needed
class Links(BaseModel):
    """Container for links found on a page, split into internal and external."""
    internal: List[Link] = []  # links within the same domain as the page
    external: List[Link] = []  # links pointing to other domains
class ScrapingResult(BaseModel):
    """Structured result returned by ContentScrapingStrategy implementations."""
    cleaned_html: str  # sanitized HTML remaining after the scraping pass
    success: bool  # whether scraping completed successfully
    # NOTE: Pydantic deep-copies field defaults per instance, so the shared
    # Media()/Links() default objects are not a mutable-default hazard here.
    media: Media = Media()
    links: Links = Links()
    metadata: Dict[str, Any] = {}  # page metadata (e.g. title, description)

View File

@@ -320,14 +320,14 @@ if __name__ == "__main__":
## 6. Scraping Modes
Crawl4AI provides two different scraping modes for HTML content processing: BeautifulSoup (default) and LXML. The LXML mode offers significantly better performance, especially for large HTML documents.
Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, ScrapingMode
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
async def main():
config = CrawlerRunConfig(
scraping_mode=ScrapingMode.LXML # Faster alternative to default BeautifulSoup
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
@@ -336,20 +336,69 @@ async def main():
)
```
You can also create your own custom scraping strategy by inheriting from `ContentScrapingStrategy`. The strategy must return a `ScrapingResult` object with the following structure:
```python
import asyncio

from crawl4ai import ContentScrapingStrategy, ScrapingResult, MediaItem, Media, Link, Links
class CustomScrapingStrategy(ContentScrapingStrategy):
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
# Implement your custom scraping logic here
return ScrapingResult(
cleaned_html="<html>...</html>", # Cleaned HTML content
success=True, # Whether scraping was successful
media=Media(
images=[ # List of images found
MediaItem(
src="https://example.com/image.jpg",
alt="Image description",
desc="Surrounding text",
score=1,
type="image",
group_id=1,
format="jpg",
width=800
)
],
videos=[], # List of videos (same structure as images)
audios=[] # List of audio files (same structure as images)
),
links=Links(
internal=[ # List of internal links
Link(
href="https://example.com/page",
text="Link text",
title="Link title",
base_domain="example.com"
)
],
external=[] # List of external links (same structure)
),
metadata={ # Additional metadata
"title": "Page Title",
"description": "Page description"
}
)
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
# For simple cases, you can use the sync version
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
```
### Performance Considerations
The LXML mode can be up to 10-20x faster than BeautifulSoup mode, particularly when processing large HTML documents. However, please note:
The LXML strategy can be up to 10-20x faster than BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
1. LXML mode is currently experimental
1. LXML strategy is currently experimental
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/unclecode/crawl4ai/issues) with a reproducible example
Choose LXML mode when:
Choose LXML strategy when:
- Processing large HTML documents (recommended for >100KB)
- Performance is critical
- Working with well-formed HTML
Stick to BeautifulSoup mode (default) when:
Stick to BeautifulSoup strategy (default) when:
- Maximum compatibility is needed
- Working with malformed HTML
- Exact parsing behavior is critical

View File

@@ -0,0 +1,21 @@
import nest_asyncio

# Allow asyncio.run() to work in environments that already run an event loop
# (e.g. Jupyter notebooks); harmless in a plain script.
nest_asyncio.apply()

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode


async def main():
    """Crawl example.com using the faster LXML-based scraping strategy."""
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # always fetch fresh content, skip the cache
        scraping_strategy=LXMLWebScrapingStrategy(),  # faster alternative to default BeautifulSoup
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=config,
        )
        print(f"Success: {result.success}")
        # Guard the markdown access: on a failed crawl, markdown_v2 may be
        # unset and the attribute chain would raise AttributeError.
        if result.success and result.markdown_v2:
            print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")


if __name__ == "__main__":
    asyncio.run(main())