refactor(scraping): replace ScrapingMode enum with strategy pattern
Replace the ScrapingMode enum with a proper strategy pattern implementation for content scraping. This change introduces:

- New ContentScrapingStrategy abstract base class
- Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations
- New Pydantic models for structured scraping results
- Updated documentation reflecting the new strategy-based approach

BREAKING CHANGE: ScrapingMode enum has been removed. Users should now use ContentScrapingStrategy implementations instead.
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
# __init__.py
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
@@ -14,7 +15,9 @@ __all__ = [
|
||||
"AsyncWebCrawler",
|
||||
"CrawlResult",
|
||||
"CacheMode",
|
||||
"ScrapingMode",
|
||||
"ContentScrapingStrategy",
|
||||
"WebScrapingStrategy",
|
||||
"LXMLWebScrapingStrategy",
|
||||
'BrowserConfig',
|
||||
'CrawlerRunConfig',
|
||||
'ExtractionStrategy',
|
||||
|
||||
@@ -9,10 +9,10 @@ from .config import (
|
||||
)
|
||||
from .user_agent_generator import UserAgentGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||
from typing import Union, List
|
||||
from enum import Enum
|
||||
|
||||
class BrowserConfig:
|
||||
"""
|
||||
@@ -184,12 +184,6 @@ class BrowserConfig:
|
||||
)
|
||||
|
||||
|
||||
class ScrapingMode(str, Enum):
|
||||
"""Enum for different scraping modes."""
|
||||
BEAUTIFULSOUP = "beautifulsoup"
|
||||
LXML = "lxml"
|
||||
|
||||
|
||||
class CrawlerRunConfig:
|
||||
"""
|
||||
Configuration class for controlling how the crawler runs each crawl operation.
|
||||
@@ -227,8 +221,8 @@ class CrawlerRunConfig:
|
||||
Default: False.
|
||||
parser_type (str): Type of parser to use for HTML parsing.
|
||||
Default: "lxml".
|
||||
scraping_mode (ScrapingMode): Scraping mode to use.
|
||||
Default: ScrapingMode.BEAUTIFULSOUP.
|
||||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||
Default: WebScrapingStrategy.
|
||||
|
||||
# Caching Parameters
|
||||
cache_mode (CacheMode or None): Defines how caching is handled.
|
||||
@@ -329,7 +323,7 @@ class CrawlerRunConfig:
|
||||
# Content Processing Parameters
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
markdown_generator: MarkdownGenerationStrategy = None,
|
||||
content_filter=None,
|
||||
only_text: bool = False,
|
||||
@@ -340,7 +334,7 @@ class CrawlerRunConfig:
|
||||
remove_forms: bool = False,
|
||||
prettiify: bool = False,
|
||||
parser_type: str = "lxml",
|
||||
scraping_mode: ScrapingMode = ScrapingMode.BEAUTIFULSOUP,
|
||||
scraping_strategy: ContentScrapingStrategy = None,
|
||||
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
@@ -413,7 +407,7 @@ class CrawlerRunConfig:
|
||||
self.remove_forms = remove_forms
|
||||
self.prettiify = prettiify
|
||||
self.parser_type = parser_type
|
||||
self.scraping_mode = scraping_mode
|
||||
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
||||
|
||||
# SSL Parameters
|
||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||
@@ -480,7 +474,6 @@ class CrawlerRunConfig:
|
||||
|
||||
# Set default chunking strategy if None
|
||||
if self.chunking_strategy is None:
|
||||
from .chunking_strategy import RegexChunking
|
||||
self.chunking_strategy = RegexChunking()
|
||||
|
||||
@staticmethod
|
||||
@@ -489,7 +482,7 @@ class CrawlerRunConfig:
|
||||
# Content Processing Parameters
|
||||
word_count_threshold=kwargs.get("word_count_threshold", 200),
|
||||
extraction_strategy=kwargs.get("extraction_strategy"),
|
||||
chunking_strategy=kwargs.get("chunking_strategy"),
|
||||
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
|
||||
markdown_generator=kwargs.get("markdown_generator"),
|
||||
content_filter=kwargs.get("content_filter"),
|
||||
only_text=kwargs.get("only_text", False),
|
||||
@@ -500,7 +493,7 @@ class CrawlerRunConfig:
|
||||
remove_forms=kwargs.get("remove_forms", False),
|
||||
prettiify=kwargs.get("prettiify", False),
|
||||
parser_type=kwargs.get("parser_type", "lxml"),
|
||||
scraping_mode=kwargs.get("scraping_mode", ScrapingMode.BEAUTIFULSOUP),
|
||||
scraping_strategy=kwargs.get("scraping_strategy"),
|
||||
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||
@@ -574,7 +567,7 @@ class CrawlerRunConfig:
|
||||
"remove_forms": self.remove_forms,
|
||||
"prettiify": self.prettiify,
|
||||
"parser_type": self.parser_type,
|
||||
"scraping_mode": self.scraping_mode,
|
||||
"scraping_strategy": self.scraping_strategy,
|
||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||
"cache_mode": self.cache_mode,
|
||||
"session_id": self.session_id,
|
||||
|
||||
@@ -543,27 +543,20 @@ class AsyncWebCrawler:
|
||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||
t1 = time.perf_counter()
|
||||
|
||||
# Initialize scraping strategy based on mode
|
||||
if config.scraping_mode == ScrapingMode.LXML:
|
||||
scrapping_strategy = LXMLWebScrapingStrategy(logger=self.logger)
|
||||
else: # Default to BeautifulSoup
|
||||
scrapping_strategy = WebScrapingStrategy(logger=self.logger)
|
||||
# Get scraping strategy and ensure it has a logger
|
||||
scraping_strategy = config.scraping_strategy
|
||||
if not scraping_strategy.logger:
|
||||
scraping_strategy.logger = self.logger
|
||||
|
||||
# Process HTML content
|
||||
params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
|
||||
# add keys from kwargs to params that doesn't exist in params
|
||||
params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
|
||||
|
||||
result = scrapping_strategy.scrap(
|
||||
result = scraping_strategy.scrap(
|
||||
url,
|
||||
html,
|
||||
**params,
|
||||
# word_count_threshold=config.word_count_threshold,
|
||||
# css_selector=config.css_selector,
|
||||
# only_text=config.only_text,
|
||||
# image_description_min_word_threshold=config.image_description_min_word_threshold,
|
||||
# content_filter=config.content_filter,
|
||||
# **kwargs
|
||||
**params
|
||||
)
|
||||
|
||||
if result is None:
|
||||
@@ -576,13 +569,17 @@ class AsyncWebCrawler:
|
||||
|
||||
|
||||
|
||||
# Extract results
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
|
||||
fit_html = sanitize_input_encode(result.get("fit_html", ""))
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
metadata = result.get("metadata", {})
|
||||
# Extract results - handle both dict and ScrapingResult
|
||||
if isinstance(result, dict):
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
media = result.get("media", {})
|
||||
links = result.get("links", {})
|
||||
metadata = result.get("metadata", {})
|
||||
else:
|
||||
cleaned_html = sanitize_input_encode(result.cleaned_html)
|
||||
media = result.media.model_dump()
|
||||
links = result.links.model_dump()
|
||||
metadata = result.metadata
|
||||
|
||||
# Markdown Generation
|
||||
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
|
||||
@@ -610,10 +607,7 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
# Handle content extraction if needed
|
||||
if (extracted_content is None and
|
||||
config.extraction_strategy and
|
||||
config.chunking_strategy and
|
||||
not isinstance(config.extraction_strategy, NoExtractionStrategy)):
|
||||
if (not bool(extracted_content) and config.extraction_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy)):
|
||||
|
||||
t1 = time.perf_counter()
|
||||
|
||||
@@ -664,8 +658,8 @@ class AsyncWebCrawler:
|
||||
cleaned_html=cleaned_html,
|
||||
markdown_v2=markdown_v2,
|
||||
markdown=markdown,
|
||||
fit_markdown=fit_markdown,
|
||||
fit_html=fit_html,
|
||||
fit_markdown=markdown_result.fit_markdown,
|
||||
fit_html=markdown_result.fit_html,
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
|
||||
@@ -21,6 +21,7 @@ from .utils import (
|
||||
from lxml import etree
|
||||
from lxml import html as lhtml
|
||||
from typing import Dict, Any, List, Tuple
|
||||
from .models import ScrapingResult, MediaItem, Link, Media, Links
|
||||
|
||||
# Pre-compile regular expressions for Open Graph and Twitter metadata
|
||||
OG_REGEX = re.compile(r'^og:')
|
||||
@@ -73,11 +74,11 @@ def fetch_image_file_size(img, base_url):
|
||||
|
||||
class ContentScrapingStrategy(ABC):
|
||||
@abstractmethod
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
pass
|
||||
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
@@ -101,7 +102,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
log_method = getattr(self.logger, level)
|
||||
log_method(message=message, tag=tag, **kwargs)
|
||||
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
"""
|
||||
Main entry point for content scraping.
|
||||
|
||||
@@ -111,16 +112,40 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
|
||||
|
||||
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
|
||||
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
|
||||
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
|
||||
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
return self._scrap(url, html, is_async=False, **kwargs)
|
||||
raw_result = self._scrap(url, html, is_async=False, **kwargs)
|
||||
if raw_result is None:
|
||||
return ScrapingResult(
|
||||
cleaned_html="",
|
||||
success=False,
|
||||
media=Media(),
|
||||
links=Links(),
|
||||
metadata={}
|
||||
)
|
||||
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
# Convert media items
|
||||
media = Media(
|
||||
images=[MediaItem(**img) for img in raw_result.get("media", {}).get("images", []) if img],
|
||||
videos=[MediaItem(**vid) for vid in raw_result.get("media", {}).get("videos", []) if vid],
|
||||
audios=[MediaItem(**aud) for aud in raw_result.get("media", {}).get("audios", []) if aud]
|
||||
)
|
||||
|
||||
# Convert links
|
||||
links = Links(
|
||||
internal=[Link(**link) for link in raw_result.get("links", {}).get("internal", []) if link],
|
||||
external=[Link(**link) for link in raw_result.get("links", {}).get("external", []) if link]
|
||||
)
|
||||
|
||||
return ScrapingResult(
|
||||
cleaned_html=raw_result.get("cleaned_html", ""),
|
||||
success=raw_result.get("success", False),
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=raw_result.get("metadata", {})
|
||||
)
|
||||
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
"""
|
||||
Main entry point for asynchronous content scraping.
|
||||
|
||||
@@ -130,12 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
|
||||
|
||||
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
|
||||
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
|
||||
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
|
||||
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple, Any
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from .ssl_certificate import SSLCertificate
|
||||
@@ -129,3 +129,38 @@ class AsyncCrawlResponse(BaseModel):
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
###############################
# Scraping Models
###############################
class MediaItem(BaseModel):
    """A single media element (image, video, or audio) discovered on a page."""
    src: str                      # source URL of the media element
    alt: Optional[str] = None     # alt text, when present in the markup
    desc: Optional[str] = None    # surrounding/descriptive text captured near the element
    score: int                    # relevance score assigned by the scraping strategy
    type: str = "image"           # media kind; defaults to "image"
    group_id: int                 # identifier grouping related media items together
    format: Optional[str] = None  # file format (e.g. "jpg"), when detectable
    width: Optional[int] = None   # rendered/declared width in pixels, when known

class Link(BaseModel):
    """A hyperlink discovered during scraping."""
    href: str                   # link target URL
    text: str                   # visible anchor text
    title: Optional[str] = None # title attribute, when present
    base_domain: str            # domain the link points to, used for internal/external split

class Media(BaseModel):
    """Collections of media items found on a page, grouped by kind."""
    images: List[MediaItem] = []
    videos: List[MediaItem] = [] # Using MediaItem model for now, can be extended with Video model if needed
    audios: List[MediaItem] = [] # Using MediaItem model for now, can be extended with Audio model if needed

class Links(BaseModel):
    """Links partitioned into same-domain (internal) and off-domain (external)."""
    internal: List[Link] = []
    external: List[Link] = []

class ScrapingResult(BaseModel):
    """Structured result returned by ContentScrapingStrategy implementations.

    Replaces the plain dict previously returned by scraping strategies; the
    crawler accepts either form and calls ``model_dump()`` on these models.
    NOTE: mutable class-attribute defaults are safe here — Pydantic copies
    field defaults per instance.
    """
    cleaned_html: str              # sanitized HTML after scraping/cleaning
    success: bool                  # whether scraping completed successfully
    media: Media = Media()         # images/videos/audios found on the page
    links: Links = Links()         # internal/external links found on the page
    metadata: Dict[str, Any] = {}  # page metadata (title, description, OG/Twitter tags, ...)
|
||||
|
||||
@@ -320,14 +320,14 @@ if __name__ == "__main__":
|
||||
|
||||
## 6. Scraping Modes
|
||||
|
||||
Crawl4AI provides two different scraping modes for HTML content processing: BeautifulSoup (default) and LXML. The LXML mode offers significantly better performance, especially for large HTML documents.
|
||||
Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, ScrapingMode
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
scraping_mode=ScrapingMode.LXML # Faster alternative to default BeautifulSoup
|
||||
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
@@ -336,20 +336,69 @@ async def main():
|
||||
)
|
||||
```
|
||||
|
||||
You can also create your own custom scraping strategy by inheriting from `ContentScrapingStrategy`. The strategy must return a `ScrapingResult` object with the following structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import ContentScrapingStrategy, ScrapingResult, MediaItem, Media, Link, Links
|
||||
|
||||
class CustomScrapingStrategy(ContentScrapingStrategy):
|
||||
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
# Implement your custom scraping logic here
|
||||
return ScrapingResult(
|
||||
cleaned_html="<html>...</html>", # Cleaned HTML content
|
||||
success=True, # Whether scraping was successful
|
||||
media=Media(
|
||||
images=[ # List of images found
|
||||
MediaItem(
|
||||
src="https://example.com/image.jpg",
|
||||
alt="Image description",
|
||||
desc="Surrounding text",
|
||||
score=1,
|
||||
type="image",
|
||||
group_id=1,
|
||||
format="jpg",
|
||||
width=800
|
||||
)
|
||||
],
|
||||
videos=[], # List of videos (same structure as images)
|
||||
audios=[] # List of audio files (same structure as images)
|
||||
),
|
||||
links=Links(
|
||||
internal=[ # List of internal links
|
||||
Link(
|
||||
href="https://example.com/page",
|
||||
text="Link text",
|
||||
title="Link title",
|
||||
base_domain="example.com"
|
||||
)
|
||||
],
|
||||
external=[] # List of external links (same structure)
|
||||
),
|
||||
metadata={ # Additional metadata
|
||||
"title": "Page Title",
|
||||
"description": "Page description"
|
||||
}
|
||||
)
|
||||
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
# For simple cases, you can use the sync version
|
||||
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||
```
|
||||
|
||||
### Performance Considerations
|
||||
|
||||
The LXML mode can be up to 10-20x faster than BeautifulSoup mode, particularly when processing large HTML documents. However, please note:
|
||||
The LXML strategy can be up to 10-20x faster than BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
|
||||
|
||||
1. LXML mode is currently experimental
|
||||
1. LXML strategy is currently experimental
|
||||
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
|
||||
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example
|
||||
|
||||
Choose LXML mode when:
|
||||
Choose LXML strategy when:
|
||||
- Processing large HTML documents (recommended for >100KB)
|
||||
- Performance is critical
|
||||
- Working with well-formed HTML
|
||||
|
||||
Stick to BeautifulSoup mode (default) when:
|
||||
Stick to BeautifulSoup strategy (default) when:
|
||||
- Maximum compatibility is needed
|
||||
- Working with malformed HTML
|
||||
- Exact parsing behavior is critical
|
||||
|
||||
21
tests/test_scraping_strategy.py
Normal file
21
tests/test_scraping_strategy.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import nest_asyncio

# Allow nested event loops (e.g. when executed inside Jupyter or another
# running loop) before any asyncio usage.
nest_asyncio.apply()

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode


async def main():
    """Crawl example.com with the LXML scraping strategy and print the outcome."""
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # always fetch fresh content for this check
        scraping_strategy=LXMLWebScrapingStrategy()  # Faster alternative to default BeautifulSoup
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=config
        )
    print(f"Success: {result.success}")
    # Guard the dereference: on a failed crawl, markdown_v2 (or its
    # raw_markdown) may be None, and the unconditional len(...) would raise.
    if result.success and result.markdown_v2 is not None:
        print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user