refactor(scraping): replace ScrapingMode enum with strategy pattern
Replace the ScrapingMode enum with a proper strategy pattern implementation for content scraping. This change introduces:
- New ContentScrapingStrategy abstract base class
- Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations
- New Pydantic models for structured scraping results
- Updated documentation reflecting the new strategy-based approach

BREAKING CHANGE: The ScrapingMode enum has been removed. Users should now use ContentScrapingStrategy implementations instead.
This commit is contained in:
@@ -1,7 +1,8 @@
|
|||||||
# __init__.py
|
# __init__.py
|
||||||
|
|
||||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||||
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
|
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
|
||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
@@ -14,7 +15,9 @@ __all__ = [
|
|||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
"CrawlResult",
|
"CrawlResult",
|
||||||
"CacheMode",
|
"CacheMode",
|
||||||
"ScrapingMode",
|
"ContentScrapingStrategy",
|
||||||
|
"WebScrapingStrategy",
|
||||||
|
"LXMLWebScrapingStrategy",
|
||||||
'BrowserConfig',
|
'BrowserConfig',
|
||||||
'CrawlerRunConfig',
|
'CrawlerRunConfig',
|
||||||
'ExtractionStrategy',
|
'ExtractionStrategy',
|
||||||
|
|||||||
@@ -9,10 +9,10 @@ from .config import (
|
|||||||
)
|
)
|
||||||
from .user_agent_generator import UserAgentGenerator
|
from .user_agent_generator import UserAgentGenerator
|
||||||
from .extraction_strategy import ExtractionStrategy
|
from .extraction_strategy import ExtractionStrategy
|
||||||
from .chunking_strategy import ChunkingStrategy
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||||
|
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||||
from typing import Union, List
|
from typing import Union, List
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
class BrowserConfig:
|
class BrowserConfig:
|
||||||
"""
|
"""
|
||||||
@@ -184,12 +184,6 @@ class BrowserConfig:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class ScrapingMode(str, Enum):
|
|
||||||
"""Enum for different scraping modes."""
|
|
||||||
BEAUTIFULSOUP = "beautifulsoup"
|
|
||||||
LXML = "lxml"
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlerRunConfig:
|
class CrawlerRunConfig:
|
||||||
"""
|
"""
|
||||||
Configuration class for controlling how the crawler runs each crawl operation.
|
Configuration class for controlling how the crawler runs each crawl operation.
|
||||||
@@ -227,8 +221,8 @@ class CrawlerRunConfig:
|
|||||||
Default: False.
|
Default: False.
|
||||||
parser_type (str): Type of parser to use for HTML parsing.
|
parser_type (str): Type of parser to use for HTML parsing.
|
||||||
Default: "lxml".
|
Default: "lxml".
|
||||||
scraping_mode (ScrapingMode): Scraping mode to use.
|
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||||
Default: ScrapingMode.BEAUTIFULSOUP.
|
Default: WebScrapingStrategy.
|
||||||
|
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
cache_mode (CacheMode or None): Defines how caching is handled.
|
cache_mode (CacheMode or None): Defines how caching is handled.
|
||||||
@@ -329,7 +323,7 @@ class CrawlerRunConfig:
|
|||||||
# Content Processing Parameters
|
# Content Processing Parameters
|
||||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
chunking_strategy: ChunkingStrategy = None,
|
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
markdown_generator: MarkdownGenerationStrategy = None,
|
markdown_generator: MarkdownGenerationStrategy = None,
|
||||||
content_filter=None,
|
content_filter=None,
|
||||||
only_text: bool = False,
|
only_text: bool = False,
|
||||||
@@ -340,7 +334,7 @@ class CrawlerRunConfig:
|
|||||||
remove_forms: bool = False,
|
remove_forms: bool = False,
|
||||||
prettiify: bool = False,
|
prettiify: bool = False,
|
||||||
parser_type: str = "lxml",
|
parser_type: str = "lxml",
|
||||||
scraping_mode: ScrapingMode = ScrapingMode.BEAUTIFULSOUP,
|
scraping_strategy: ContentScrapingStrategy = None,
|
||||||
|
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate: bool = False,
|
fetch_ssl_certificate: bool = False,
|
||||||
@@ -413,7 +407,7 @@ class CrawlerRunConfig:
|
|||||||
self.remove_forms = remove_forms
|
self.remove_forms = remove_forms
|
||||||
self.prettiify = prettiify
|
self.prettiify = prettiify
|
||||||
self.parser_type = parser_type
|
self.parser_type = parser_type
|
||||||
self.scraping_mode = scraping_mode
|
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
||||||
|
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||||
@@ -480,7 +474,6 @@ class CrawlerRunConfig:
|
|||||||
|
|
||||||
# Set default chunking strategy if None
|
# Set default chunking strategy if None
|
||||||
if self.chunking_strategy is None:
|
if self.chunking_strategy is None:
|
||||||
from .chunking_strategy import RegexChunking
|
|
||||||
self.chunking_strategy = RegexChunking()
|
self.chunking_strategy = RegexChunking()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -489,7 +482,7 @@ class CrawlerRunConfig:
|
|||||||
# Content Processing Parameters
|
# Content Processing Parameters
|
||||||
word_count_threshold=kwargs.get("word_count_threshold", 200),
|
word_count_threshold=kwargs.get("word_count_threshold", 200),
|
||||||
extraction_strategy=kwargs.get("extraction_strategy"),
|
extraction_strategy=kwargs.get("extraction_strategy"),
|
||||||
chunking_strategy=kwargs.get("chunking_strategy"),
|
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
|
||||||
markdown_generator=kwargs.get("markdown_generator"),
|
markdown_generator=kwargs.get("markdown_generator"),
|
||||||
content_filter=kwargs.get("content_filter"),
|
content_filter=kwargs.get("content_filter"),
|
||||||
only_text=kwargs.get("only_text", False),
|
only_text=kwargs.get("only_text", False),
|
||||||
@@ -500,7 +493,7 @@ class CrawlerRunConfig:
|
|||||||
remove_forms=kwargs.get("remove_forms", False),
|
remove_forms=kwargs.get("remove_forms", False),
|
||||||
prettiify=kwargs.get("prettiify", False),
|
prettiify=kwargs.get("prettiify", False),
|
||||||
parser_type=kwargs.get("parser_type", "lxml"),
|
parser_type=kwargs.get("parser_type", "lxml"),
|
||||||
scraping_mode=kwargs.get("scraping_mode", ScrapingMode.BEAUTIFULSOUP),
|
scraping_strategy=kwargs.get("scraping_strategy"),
|
||||||
|
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||||
@@ -574,7 +567,7 @@ class CrawlerRunConfig:
|
|||||||
"remove_forms": self.remove_forms,
|
"remove_forms": self.remove_forms,
|
||||||
"prettiify": self.prettiify,
|
"prettiify": self.prettiify,
|
||||||
"parser_type": self.parser_type,
|
"parser_type": self.parser_type,
|
||||||
"scraping_mode": self.scraping_mode,
|
"scraping_strategy": self.scraping_strategy,
|
||||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||||
"cache_mode": self.cache_mode,
|
"cache_mode": self.cache_mode,
|
||||||
"session_id": self.session_id,
|
"session_id": self.session_id,
|
||||||
|
|||||||
@@ -543,27 +543,20 @@ class AsyncWebCrawler:
|
|||||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||||
t1 = time.perf_counter()
|
t1 = time.perf_counter()
|
||||||
|
|
||||||
# Initialize scraping strategy based on mode
|
# Get scraping strategy and ensure it has a logger
|
||||||
if config.scraping_mode == ScrapingMode.LXML:
|
scraping_strategy = config.scraping_strategy
|
||||||
scrapping_strategy = LXMLWebScrapingStrategy(logger=self.logger)
|
if not scraping_strategy.logger:
|
||||||
else: # Default to BeautifulSoup
|
scraping_strategy.logger = self.logger
|
||||||
scrapping_strategy = WebScrapingStrategy(logger=self.logger)
|
|
||||||
|
|
||||||
# Process HTML content
|
# Process HTML content
|
||||||
params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
|
params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
|
||||||
# add keys from kwargs to params that doesn't exist in params
|
# add keys from kwargs to params that doesn't exist in params
|
||||||
params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
|
params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
|
||||||
|
|
||||||
result = scrapping_strategy.scrap(
|
result = scraping_strategy.scrap(
|
||||||
url,
|
url,
|
||||||
html,
|
html,
|
||||||
**params,
|
**params
|
||||||
# word_count_threshold=config.word_count_threshold,
|
|
||||||
# css_selector=config.css_selector,
|
|
||||||
# only_text=config.only_text,
|
|
||||||
# image_description_min_word_threshold=config.image_description_min_word_threshold,
|
|
||||||
# content_filter=config.content_filter,
|
|
||||||
# **kwargs
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if result is None:
|
if result is None:
|
||||||
@@ -576,13 +569,17 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Extract results
|
# Extract results - handle both dict and ScrapingResult
|
||||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
if isinstance(result, dict):
|
||||||
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
|
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||||
fit_html = sanitize_input_encode(result.get("fit_html", ""))
|
media = result.get("media", {})
|
||||||
media = result.get("media", [])
|
links = result.get("links", {})
|
||||||
links = result.get("links", [])
|
metadata = result.get("metadata", {})
|
||||||
metadata = result.get("metadata", {})
|
else:
|
||||||
|
cleaned_html = sanitize_input_encode(result.cleaned_html)
|
||||||
|
media = result.media.model_dump()
|
||||||
|
links = result.links.model_dump()
|
||||||
|
metadata = result.metadata
|
||||||
|
|
||||||
# Markdown Generation
|
# Markdown Generation
|
||||||
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
|
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
|
||||||
@@ -610,10 +607,7 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Handle content extraction if needed
|
# Handle content extraction if needed
|
||||||
if (extracted_content is None and
|
if (not bool(extracted_content) and config.extraction_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy)):
|
||||||
config.extraction_strategy and
|
|
||||||
config.chunking_strategy and
|
|
||||||
not isinstance(config.extraction_strategy, NoExtractionStrategy)):
|
|
||||||
|
|
||||||
t1 = time.perf_counter()
|
t1 = time.perf_counter()
|
||||||
|
|
||||||
@@ -664,8 +658,8 @@ class AsyncWebCrawler:
|
|||||||
cleaned_html=cleaned_html,
|
cleaned_html=cleaned_html,
|
||||||
markdown_v2=markdown_v2,
|
markdown_v2=markdown_v2,
|
||||||
markdown=markdown,
|
markdown=markdown,
|
||||||
fit_markdown=fit_markdown,
|
fit_markdown=markdown_result.fit_markdown,
|
||||||
fit_html=fit_html,
|
fit_html=markdown_result.fit_html,
|
||||||
media=media,
|
media=media,
|
||||||
links=links,
|
links=links,
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from .utils import (
|
|||||||
from lxml import etree
|
from lxml import etree
|
||||||
from lxml import html as lhtml
|
from lxml import html as lhtml
|
||||||
from typing import Dict, Any, List, Tuple
|
from typing import Dict, Any, List, Tuple
|
||||||
|
from .models import ScrapingResult, MediaItem, Link, Media, Links
|
||||||
|
|
||||||
# Pre-compile regular expressions for Open Graph and Twitter metadata
|
# Pre-compile regular expressions for Open Graph and Twitter metadata
|
||||||
OG_REGEX = re.compile(r'^og:')
|
OG_REGEX = re.compile(r'^og:')
|
||||||
@@ -73,11 +74,11 @@ def fetch_image_file_size(img, base_url):
|
|||||||
|
|
||||||
class ContentScrapingStrategy(ABC):
|
class ContentScrapingStrategy(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||||
@@ -101,7 +102,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
log_method = getattr(self.logger, level)
|
log_method = getattr(self.logger, level)
|
||||||
log_method(message=message, tag=tag, **kwargs)
|
log_method(message=message, tag=tag, **kwargs)
|
||||||
|
|
||||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
"""
|
"""
|
||||||
Main entry point for content scraping.
|
Main entry point for content scraping.
|
||||||
|
|
||||||
@@ -111,16 +112,40 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
**kwargs: Additional keyword arguments.
|
**kwargs: Additional keyword arguments.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
|
ScrapingResult: A structured result containing the scraped content.
|
||||||
|
|
||||||
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
|
|
||||||
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
|
|
||||||
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
|
|
||||||
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
|
|
||||||
"""
|
"""
|
||||||
return self._scrap(url, html, is_async=False, **kwargs)
|
raw_result = self._scrap(url, html, is_async=False, **kwargs)
|
||||||
|
if raw_result is None:
|
||||||
|
return ScrapingResult(
|
||||||
|
cleaned_html="",
|
||||||
|
success=False,
|
||||||
|
media=Media(),
|
||||||
|
links=Links(),
|
||||||
|
metadata={}
|
||||||
|
)
|
||||||
|
|
||||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
# Convert media items
|
||||||
|
media = Media(
|
||||||
|
images=[MediaItem(**img) for img in raw_result.get("media", {}).get("images", []) if img],
|
||||||
|
videos=[MediaItem(**vid) for vid in raw_result.get("media", {}).get("videos", []) if vid],
|
||||||
|
audios=[MediaItem(**aud) for aud in raw_result.get("media", {}).get("audios", []) if aud]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Convert links
|
||||||
|
links = Links(
|
||||||
|
internal=[Link(**link) for link in raw_result.get("links", {}).get("internal", []) if link],
|
||||||
|
external=[Link(**link) for link in raw_result.get("links", {}).get("external", []) if link]
|
||||||
|
)
|
||||||
|
|
||||||
|
return ScrapingResult(
|
||||||
|
cleaned_html=raw_result.get("cleaned_html", ""),
|
||||||
|
success=raw_result.get("success", False),
|
||||||
|
media=media,
|
||||||
|
links=links,
|
||||||
|
metadata=raw_result.get("metadata", {})
|
||||||
|
)
|
||||||
|
|
||||||
|
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
"""
|
"""
|
||||||
Main entry point for asynchronous content scraping.
|
Main entry point for asynchronous content scraping.
|
||||||
|
|
||||||
@@ -130,12 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
**kwargs: Additional keyword arguments.
|
**kwargs: Additional keyword arguments.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
|
ScrapingResult: A structured result containing the scraped content.
|
||||||
|
|
||||||
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
|
|
||||||
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
|
|
||||||
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
|
|
||||||
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
|
|
||||||
"""
|
"""
|
||||||
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
from pydantic import BaseModel, HttpUrl
|
from pydantic import BaseModel, HttpUrl
|
||||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple
|
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple, Any
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from .ssl_certificate import SSLCertificate
|
from .ssl_certificate import SSLCertificate
|
||||||
@@ -129,3 +129,38 @@ class AsyncCrawlResponse(BaseModel):
|
|||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
###############################
|
||||||
|
# Scraping Models
|
||||||
|
###############################
|
||||||
|
class MediaItem(BaseModel):
|
||||||
|
src: str
|
||||||
|
alt: Optional[str] = None
|
||||||
|
desc: Optional[str] = None
|
||||||
|
score: int
|
||||||
|
type: str = "image"
|
||||||
|
group_id: int
|
||||||
|
format: Optional[str] = None
|
||||||
|
width: Optional[int] = None
|
||||||
|
|
||||||
|
class Link(BaseModel):
|
||||||
|
href: str
|
||||||
|
text: str
|
||||||
|
title: Optional[str] = None
|
||||||
|
base_domain: str
|
||||||
|
|
||||||
|
class Media(BaseModel):
|
||||||
|
images: List[MediaItem] = []
|
||||||
|
videos: List[MediaItem] = [] # Using MediaItem model for now, can be extended with Video model if needed
|
||||||
|
audios: List[MediaItem] = [] # Using MediaItem model for now, can be extended with Audio model if needed
|
||||||
|
|
||||||
|
class Links(BaseModel):
|
||||||
|
internal: List[Link] = []
|
||||||
|
external: List[Link] = []
|
||||||
|
|
||||||
|
class ScrapingResult(BaseModel):
|
||||||
|
cleaned_html: str
|
||||||
|
success: bool
|
||||||
|
media: Media = Media()
|
||||||
|
links: Links = Links()
|
||||||
|
metadata: Dict[str, Any] = {}
|
||||||
|
|||||||
@@ -320,14 +320,14 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
## 6. Scraping Modes
|
## 6. Scraping Modes
|
||||||
|
|
||||||
Crawl4AI provides two different scraping modes for HTML content processing: BeautifulSoup (default) and LXML. The LXML mode offers significantly better performance, especially for large HTML documents.
|
Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, ScrapingMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
scraping_mode=ScrapingMode.LXML # Faster alternative to default BeautifulSoup
|
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
||||||
)
|
)
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
@@ -336,20 +336,69 @@ async def main():
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can also create your own custom scraping strategy by inheriting from `ContentScrapingStrategy`. The strategy must return a `ScrapingResult` object with the following structure:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import ContentScrapingStrategy, ScrapingResult, MediaItem, Media, Link, Links
|
||||||
|
|
||||||
|
class CustomScrapingStrategy(ContentScrapingStrategy):
|
||||||
|
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
|
# Implement your custom scraping logic here
|
||||||
|
return ScrapingResult(
|
||||||
|
cleaned_html="<html>...</html>", # Cleaned HTML content
|
||||||
|
success=True, # Whether scraping was successful
|
||||||
|
media=Media(
|
||||||
|
images=[ # List of images found
|
||||||
|
MediaItem(
|
||||||
|
src="https://example.com/image.jpg",
|
||||||
|
alt="Image description",
|
||||||
|
desc="Surrounding text",
|
||||||
|
score=1,
|
||||||
|
type="image",
|
||||||
|
group_id=1,
|
||||||
|
format="jpg",
|
||||||
|
width=800
|
||||||
|
)
|
||||||
|
],
|
||||||
|
videos=[], # List of videos (same structure as images)
|
||||||
|
audios=[] # List of audio files (same structure as images)
|
||||||
|
),
|
||||||
|
links=Links(
|
||||||
|
internal=[ # List of internal links
|
||||||
|
Link(
|
||||||
|
href="https://example.com/page",
|
||||||
|
text="Link text",
|
||||||
|
title="Link title",
|
||||||
|
base_domain="example.com"
|
||||||
|
)
|
||||||
|
],
|
||||||
|
external=[] # List of external links (same structure)
|
||||||
|
),
|
||||||
|
metadata={ # Additional metadata
|
||||||
|
"title": "Page Title",
|
||||||
|
"description": "Page description"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
|
# For simple cases, you can use the sync version
|
||||||
|
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||||
|
```
|
||||||
|
|
||||||
### Performance Considerations
|
### Performance Considerations
|
||||||
|
|
||||||
The LXML mode can be up to 10-20x faster than BeautifulSoup mode, particularly when processing large HTML documents. However, please note:
|
The LXML strategy can be up to 10-20x faster than the BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
|
||||||
|
|
||||||
1. LXML mode is currently experimental
|
1. The LXML strategy is currently experimental
|
||||||
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
|
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
|
||||||
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example
|
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example
|
||||||
|
|
||||||
Choose LXML mode when:
|
Choose the LXML strategy when:
|
||||||
- Processing large HTML documents (recommended for >100KB)
|
- Processing large HTML documents (recommended for >100KB)
|
||||||
- Performance is critical
|
- Performance is critical
|
||||||
- Working with well-formed HTML
|
- Working with well-formed HTML
|
||||||
|
|
||||||
Stick to BeautifulSoup mode (default) when:
|
Stick to the BeautifulSoup strategy (default) when:
|
||||||
- Maximum compatibility is needed
|
- Maximum compatibility is needed
|
||||||
- Working with malformed HTML
|
- Working with malformed HTML
|
||||||
- Exact parsing behavior is critical
|
- Exact parsing behavior is critical
|
||||||
|
|||||||
21
tests/test_scraping_strategy.py
Normal file
21
tests/test_scraping_strategy.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
import nest_asyncio
|
||||||
|
nest_asyncio.apply()
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
||||||
|
)
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
config=config
|
||||||
|
)
|
||||||
|
print(f"Success: {result.success}")
|
||||||
|
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
Reference in New Issue
Block a user