refactor(scraping): replace ScrapingMode enum with strategy pattern

Replace the ScrapingMode enum with a proper strategy pattern implementation for content scraping.
This change introduces:
- New ContentScrapingStrategy abstract base class
- Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations
- New Pydantic models for structured scraping results
- Updated documentation reflecting the new strategy-based approach

BREAKING CHANGE: ScrapingMode enum has been removed. Users should now use ContentScrapingStrategy implementations instead.
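A minimal migration sketch for downstream code (hypothetical user snippet; the imports match the updated `__init__.py` diff below):

```python
# Before this commit (ScrapingMode enum, now removed):
# from crawl4ai import CrawlerRunConfig, ScrapingMode
# config = CrawlerRunConfig(scraping_mode=ScrapingMode.LXML)

# After this commit (strategy instances):
from crawl4ai import CrawlerRunConfig, LXMLWebScrapingStrategy

config = CrawlerRunConfig(
    scraping_strategy=LXMLWebScrapingStrategy()  # omit to get the BeautifulSoup default
)
```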
UncleCode
2025-01-13 17:53:12 +08:00
parent f3ae5a657c
commit c3370ec5da
7 changed files with 185 additions and 70 deletions

View File: crawl4ai/__init__.py

@@ -1,7 +1,8 @@
 # __init__.py
 from .async_webcrawler import AsyncWebCrawler, CacheMode
-from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode
+from .async_configs import BrowserConfig, CrawlerRunConfig
+from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
 from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
 from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -14,7 +15,9 @@ __all__ = [
     "AsyncWebCrawler",
     "CrawlResult",
     "CacheMode",
-    "ScrapingMode",
+    "ContentScrapingStrategy",
+    "WebScrapingStrategy",
+    "LXMLWebScrapingStrategy",
     'BrowserConfig',
     'CrawlerRunConfig',
     'ExtractionStrategy',

View File: crawl4ai/async_configs.py

@@ -9,10 +9,10 @@ from .config import (
 )
 from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
-from .chunking_strategy import ChunkingStrategy
+from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import MarkdownGenerationStrategy
+from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
 from typing import Union, List
-from enum import Enum

 class BrowserConfig:
     """
@@ -184,12 +184,6 @@ class BrowserConfig:
 )

-class ScrapingMode(str, Enum):
-    """Enum for different scraping modes."""
-    BEAUTIFULSOUP = "beautifulsoup"
-    LXML = "lxml"

 class CrawlerRunConfig:
     """
     Configuration class for controlling how the crawler runs each crawl operation.
@@ -227,8 +221,8 @@ class CrawlerRunConfig:
         Default: False.
     parser_type (str): Type of parser to use for HTML parsing.
         Default: "lxml".
-    scraping_mode (ScrapingMode): Scraping mode to use.
-        Default: ScrapingMode.BEAUTIFULSOUP.
+    scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
+        Default: WebScrapingStrategy.

     # Caching Parameters
     cache_mode (CacheMode or None): Defines how caching is handled.
@@ -329,7 +323,7 @@
     # Content Processing Parameters
     word_count_threshold: int = MIN_WORD_THRESHOLD,
     extraction_strategy: ExtractionStrategy = None,
-    chunking_strategy: ChunkingStrategy = None,
+    chunking_strategy: ChunkingStrategy = RegexChunking(),
     markdown_generator: MarkdownGenerationStrategy = None,
     content_filter=None,
     only_text: bool = False,
@@ -340,7 +334,7 @@
     remove_forms: bool = False,
     prettiify: bool = False,
     parser_type: str = "lxml",
-    scraping_mode: ScrapingMode = ScrapingMode.BEAUTIFULSOUP,
+    scraping_strategy: ContentScrapingStrategy = None,

     # SSL Parameters
     fetch_ssl_certificate: bool = False,
@@ -413,7 +407,7 @@
     self.remove_forms = remove_forms
     self.prettiify = prettiify
     self.parser_type = parser_type
-    self.scraping_mode = scraping_mode
+    self.scraping_strategy = scraping_strategy or WebScrapingStrategy()

     # SSL Parameters
     self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -480,7 +474,6 @@
     # Set default chunking strategy if None
     if self.chunking_strategy is None:
-        from .chunking_strategy import RegexChunking
         self.chunking_strategy = RegexChunking()

 @staticmethod
@@ -489,7 +482,7 @@
     # Content Processing Parameters
     word_count_threshold=kwargs.get("word_count_threshold", 200),
     extraction_strategy=kwargs.get("extraction_strategy"),
-    chunking_strategy=kwargs.get("chunking_strategy"),
+    chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
     markdown_generator=kwargs.get("markdown_generator"),
     content_filter=kwargs.get("content_filter"),
     only_text=kwargs.get("only_text", False),
@@ -500,7 +493,7 @@
     remove_forms=kwargs.get("remove_forms", False),
     prettiify=kwargs.get("prettiify", False),
     parser_type=kwargs.get("parser_type", "lxml"),
-    scraping_mode=kwargs.get("scraping_mode", ScrapingMode.BEAUTIFULSOUP),
+    scraping_strategy=kwargs.get("scraping_strategy"),

     # SSL Parameters
     fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
@@ -574,7 +567,7 @@
     "remove_forms": self.remove_forms,
     "prettiify": self.prettiify,
     "parser_type": self.parser_type,
-    "scraping_mode": self.scraping_mode,
+    "scraping_strategy": self.scraping_strategy,
     "fetch_ssl_certificate": self.fetch_ssl_certificate,
     "cache_mode": self.cache_mode,
     "session_id": self.session_id,

View File: crawl4ai/async_webcrawler.py

@@ -543,27 +543,20 @@ class AsyncWebCrawler:
     _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
     t1 = time.perf_counter()

-    # Initialize scraping strategy based on mode
-    if config.scraping_mode == ScrapingMode.LXML:
-        scrapping_strategy = LXMLWebScrapingStrategy(logger=self.logger)
-    else:  # Default to BeautifulSoup
-        scrapping_strategy = WebScrapingStrategy(logger=self.logger)
+    # Get scraping strategy and ensure it has a logger
+    scraping_strategy = config.scraping_strategy
+    if not scraping_strategy.logger:
+        scraping_strategy.logger = self.logger

     # Process HTML content
     params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
     # add keys from kwargs to params that don't exist in params
     params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

-    result = scrapping_strategy.scrap(
+    result = scraping_strategy.scrap(
         url,
         html,
-        **params,
-        # word_count_threshold=config.word_count_threshold,
-        # css_selector=config.css_selector,
-        # only_text=config.only_text,
-        # image_description_min_word_threshold=config.image_description_min_word_threshold,
-        # content_filter=config.content_filter,
-        # **kwargs
+        **params
     )

     if result is None:
@@ -576,13 +569,17 @@
-    # Extract results
-    cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
-    fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
-    fit_html = sanitize_input_encode(result.get("fit_html", ""))
-    media = result.get("media", [])
-    links = result.get("links", [])
-    metadata = result.get("metadata", {})
+    # Extract results - handle both dict and ScrapingResult
+    if isinstance(result, dict):
+        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+        media = result.get("media", {})
+        links = result.get("links", {})
+        metadata = result.get("metadata", {})
+    else:
+        cleaned_html = sanitize_input_encode(result.cleaned_html)
+        media = result.media.model_dump()
+        links = result.links.model_dump()
+        metadata = result.metadata

     # Markdown Generation
     markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
@@ -610,10 +607,7 @@
     )

     # Handle content extraction if needed
-    if (extracted_content is None and
-        config.extraction_strategy and
-        config.chunking_strategy and
-        not isinstance(config.extraction_strategy, NoExtractionStrategy)):
+    if (not bool(extracted_content) and config.extraction_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy)):

         t1 = time.perf_counter()
@@ -664,8 +658,8 @@
     cleaned_html=cleaned_html,
     markdown_v2=markdown_v2,
     markdown=markdown,
-    fit_markdown=fit_markdown,
-    fit_html=fit_html,
+    fit_markdown=markdown_result.fit_markdown,
+    fit_html=markdown_result.fit_html,
     media=media,
     links=links,
     metadata=metadata,
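The `isinstance` branch above is what keeps old-style strategies working: anything that still returns a plain dict is read with `.get()`, while `ScrapingResult` objects are flattened through Pydantic's `model_dump()`. A standalone sketch of that normalization (hypothetical helper, not part of the commit):

```python
from typing import Any, Dict, Union

from crawl4ai.models import ScrapingResult  # module path per the models.py diff below

def normalize_scrape_result(result: Union[Dict[str, Any], ScrapingResult]) -> Dict[str, Any]:
    """Flatten either result shape into the dict the crawler consumes."""
    if isinstance(result, dict):  # legacy dict-returning strategy
        return {
            "cleaned_html": result.get("cleaned_html", ""),
            "media": result.get("media", {}),
            "links": result.get("links", {}),
            "metadata": result.get("metadata", {}),
        }
    return {
        "cleaned_html": result.cleaned_html,
        "media": result.media.model_dump(),
        "links": result.links.model_dump(),
        "metadata": result.metadata,
    }
```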

View File: crawl4ai/content_scraping_strategy.py

@@ -21,6 +21,7 @@ from .utils import (
 from lxml import etree
 from lxml import html as lhtml
 from typing import Dict, Any, List, Tuple
+from .models import ScrapingResult, MediaItem, Link, Media, Links

 # Pre-compile regular expressions for Open Graph and Twitter metadata
 OG_REGEX = re.compile(r'^og:')
@@ -73,11 +74,11 @@ def fetch_image_file_size(img, base_url):
 class ContentScrapingStrategy(ABC):
     @abstractmethod
-    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
         pass

     @abstractmethod
-    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
         pass

 class WebScrapingStrategy(ContentScrapingStrategy):
@@ -101,7 +102,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
         log_method = getattr(self.logger, level)
         log_method(message=message, tag=tag, **kwargs)

-    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
         """
         Main entry point for content scraping.
@@ -111,16 +112,40 @@ class WebScrapingStrategy(ContentScrapingStrategy):
             **kwargs: Additional keyword arguments.

         Returns:
-            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
-                - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
-                - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
-                - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
-                - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+            ScrapingResult: A structured result containing the scraped content.
         """
-        return self._scrap(url, html, is_async=False, **kwargs)
+        raw_result = self._scrap(url, html, is_async=False, **kwargs)
+        if raw_result is None:
+            return ScrapingResult(
+                cleaned_html="",
+                success=False,
+                media=Media(),
+                links=Links(),
+                metadata={}
+            )
+
+        # Convert media items
+        media = Media(
+            images=[MediaItem(**img) for img in raw_result.get("media", {}).get("images", []) if img],
+            videos=[MediaItem(**vid) for vid in raw_result.get("media", {}).get("videos", []) if vid],
+            audios=[MediaItem(**aud) for aud in raw_result.get("media", {}).get("audios", []) if aud]
+        )
+
+        # Convert links
+        links = Links(
+            internal=[Link(**link) for link in raw_result.get("links", {}).get("internal", []) if link],
+            external=[Link(**link) for link in raw_result.get("links", {}).get("external", []) if link]
+        )
+
+        return ScrapingResult(
+            cleaned_html=raw_result.get("cleaned_html", ""),
+            success=raw_result.get("success", False),
+            media=media,
+            links=links,
+            metadata=raw_result.get("metadata", {})
+        )

-    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
         """
         Main entry point for asynchronous content scraping.
@@ -130,12 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
             **kwargs: Additional keyword arguments.

         Returns:
-            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
-                - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
-                - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
-                - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
-                - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+            ScrapingResult: A structured result containing the scraped content.
         """
         return await asyncio.to_thread(self._scrap, url, html, **kwargs)
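With the new return type, a strategy can be exercised directly, without a crawler. A quick sketch (assuming the exports added to `__init__.py` in this commit):

```python
from crawl4ai import WebScrapingStrategy

strategy = WebScrapingStrategy()
result = strategy.scrap(
    "https://example.com",
    "<html><body><h1>Title</h1><a href='/about'>About</a></body></html>",
)

# scrap() now returns a ScrapingResult rather than a raw dict
print(result.success)
print(result.cleaned_html)
print([link.href for link in result.links.internal])
```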

View File: crawl4ai/models.py

@@ -1,5 +1,5 @@
from pydantic import BaseModel, HttpUrl from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple, Any
from enum import Enum from enum import Enum
from dataclasses import dataclass, field from dataclasses import dataclass, field
from .ssl_certificate import SSLCertificate from .ssl_certificate import SSLCertificate
@@ -129,3 +129,38 @@ class AsyncCrawlResponse(BaseModel):
     class Config:
         arbitrary_types_allowed = True
+
+###############################
+# Scraping Models
+###############################
+class MediaItem(BaseModel):
+    src: str
+    alt: Optional[str] = None
+    desc: Optional[str] = None
+    score: int
+    type: str = "image"
+    group_id: int
+    format: Optional[str] = None
+    width: Optional[int] = None
+
+class Link(BaseModel):
+    href: str
+    text: str
+    title: Optional[str] = None
+    base_domain: str
+
+class Media(BaseModel):
+    images: List[MediaItem] = []
+    videos: List[MediaItem] = []  # Using MediaItem model for now, can be extended with a Video model if needed
+    audios: List[MediaItem] = []  # Using MediaItem model for now, can be extended with an Audio model if needed
+
+class Links(BaseModel):
+    internal: List[Link] = []
+    external: List[Link] = []
+
+class ScrapingResult(BaseModel):
+    cleaned_html: str
+    success: bool
+    media: Media = Media()
+    links: Links = Links()
+    metadata: Dict[str, Any] = {}
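A short sketch of how these models compose and round-trip (importing from `crawl4ai.models`, the module shown in this diff; field names as defined above):

```python
from crawl4ai.models import Link, Links, Media, MediaItem, ScrapingResult

result = ScrapingResult(
    cleaned_html="<p>hello</p>",
    success=True,
    media=Media(images=[MediaItem(src="/logo.png", score=5, group_id=1)]),
    links=Links(internal=[Link(href="/about", text="About", base_domain="example.com")]),
)

# model_dump() is what AsyncWebCrawler uses to flatten these back into plain dicts
data = result.media.model_dump()
print(data["images"][0]["src"])  # /logo.png
```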

View File: documentation page ("Scraping Modes" section)

@@ -320,14 +320,14 @@ if __name__ == "__main__":
 ## 6. Scraping Modes

-Crawl4AI provides two different scraping modes for HTML content processing: BeautifulSoup (default) and LXML. The LXML mode offers significantly better performance, especially for large HTML documents.
+Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.

 ```python
-from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, ScrapingMode
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy

 async def main():
     config = CrawlerRunConfig(
-        scraping_mode=ScrapingMode.LXML  # Faster alternative to default BeautifulSoup
+        scraping_strategy=LXMLWebScrapingStrategy()  # Faster alternative to default BeautifulSoup
     )

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
@@ -336,20 +336,69 @@ async def main():
         )
 ```

+You can also create your own custom scraping strategy by inheriting from `ContentScrapingStrategy`. The strategy must return a `ScrapingResult` object with the following structure:
+
+```python
+import asyncio
+
+from crawl4ai import ContentScrapingStrategy, ScrapingResult, MediaItem, Media, Link, Links
+
+class CustomScrapingStrategy(ContentScrapingStrategy):
+    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
+        # Implement your custom scraping logic here
+        return ScrapingResult(
+            cleaned_html="<html>...</html>",  # Cleaned HTML content
+            success=True,                     # Whether scraping was successful
+            media=Media(
+                images=[                      # List of images found
+                    MediaItem(
+                        src="https://example.com/image.jpg",
+                        alt="Image description",
+                        desc="Surrounding text",
+                        score=1,
+                        type="image",
+                        group_id=1,
+                        format="jpg",
+                        width=800
+                    )
+                ],
+                videos=[],                    # List of videos (same structure as images)
+                audios=[]                     # List of audio files (same structure as images)
+            ),
+            links=Links(
+                internal=[                    # List of internal links
+                    Link(
+                        href="https://example.com/page",
+                        text="Link text",
+                        title="Link title",
+                        base_domain="example.com"
+                    )
+                ],
+                external=[]                   # List of external links (same structure)
+            ),
+            metadata={                        # Additional metadata
+                "title": "Page Title",
+                "description": "Page description"
+            }
+        )
+
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
+        # For simple cases, you can run the sync version in a thread
+        return await asyncio.to_thread(self.scrap, url, html, **kwargs)
+```
 ### Performance Considerations

-The LXML mode can be up to 10-20x faster than BeautifulSoup mode, particularly when processing large HTML documents. However, please note:
+The LXML strategy can be up to 10-20x faster than the BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:

-1. LXML mode is currently experimental
+1. The LXML strategy is currently experimental
 2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
 3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example

-Choose LXML mode when:
+Choose the LXML strategy when:
 - Processing large HTML documents (recommended for >100KB)
 - Performance is critical
 - Working with well-formed HTML

-Stick to BeautifulSoup mode (default) when:
+Stick to the BeautifulSoup strategy (default) when:
 - Maximum compatibility is needed
 - Working with malformed HTML
 - Exact parsing behavior is critical
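A rough way to check the claimed speed-up on your own pages (a sketch; absolute numbers vary widely by document):

```python
import time
from crawl4ai import WebScrapingStrategy, LXMLWebScrapingStrategy

with open("large_page.html") as f:  # any large, well-formed HTML file
    html = f.read()

for strategy in (WebScrapingStrategy(), LXMLWebScrapingStrategy()):
    t0 = time.perf_counter()
    strategy.scrap("https://example.com", html)
    print(f"{type(strategy).__name__}: {time.perf_counter() - t0:.3f}s")
```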

View File: new example script (LXML scraping strategy demo)

@@ -0,0 +1,21 @@
+import nest_asyncio
+nest_asyncio.apply()
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode
+
+async def main():
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        scraping_strategy=LXMLWebScrapingStrategy()  # Faster alternative to default BeautifulSoup
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=config
+        )
+        print(f"Success: {result.success}")
+        print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
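The `nest_asyncio` lines in this example matter only in environments that already run an event loop (e.g. Jupyter notebooks); a plain script can drop them and call `asyncio.run(main())` directly.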