diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 1c33b311..86c2cb9e 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -1,7 +1,8 @@ # __init__.py from .async_webcrawler import AsyncWebCrawler, CacheMode -from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode +from .async_configs import BrowserConfig, CrawlerRunConfig +from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator @@ -14,7 +15,9 @@ __all__ = [ "AsyncWebCrawler", "CrawlResult", "CacheMode", - "ScrapingMode", + "ContentScrapingStrategy", + "WebScrapingStrategy", + "LXMLWebScrapingStrategy", 'BrowserConfig', 'CrawlerRunConfig', 'ExtractionStrategy', diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 3c5c0433..28f90bb3 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -9,10 +9,10 @@ from .config import ( ) from .user_agent_generator import UserAgentGenerator from .extraction_strategy import ExtractionStrategy -from .chunking_strategy import ChunkingStrategy +from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy +from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from typing import Union, List -from enum import Enum class BrowserConfig: """ @@ -184,12 +184,6 @@ class BrowserConfig: ) -class ScrapingMode(str, Enum): - """Enum for different scraping modes.""" - BEAUTIFULSOUP = "beautifulsoup" - LXML = "lxml" - - class CrawlerRunConfig: """ Configuration class for controlling how the crawler runs each crawl operation. @@ -227,8 +221,8 @@ class CrawlerRunConfig: Default: False. 
parser_type (str): Type of parser to use for HTML parsing. Default: "lxml". - scraping_mode (ScrapingMode): Scraping mode to use. - Default: ScrapingMode.BEAUTIFULSOUP. + scraping_strategy (ContentScrapingStrategy): Scraping strategy to use. + Default: WebScrapingStrategy. # Caching Parameters cache_mode (CacheMode or None): Defines how caching is handled. @@ -329,7 +323,7 @@ class CrawlerRunConfig: # Content Processing Parameters word_count_threshold: int = MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), markdown_generator: MarkdownGenerationStrategy = None, content_filter=None, only_text: bool = False, @@ -340,7 +334,7 @@ class CrawlerRunConfig: remove_forms: bool = False, prettiify: bool = False, parser_type: str = "lxml", - scraping_mode: ScrapingMode = ScrapingMode.BEAUTIFULSOUP, + scraping_strategy: ContentScrapingStrategy = None, # SSL Parameters fetch_ssl_certificate: bool = False, @@ -413,7 +407,7 @@ class CrawlerRunConfig: self.remove_forms = remove_forms self.prettiify = prettiify self.parser_type = parser_type - self.scraping_mode = scraping_mode + self.scraping_strategy = scraping_strategy or WebScrapingStrategy() # SSL Parameters self.fetch_ssl_certificate = fetch_ssl_certificate @@ -480,7 +474,6 @@ class CrawlerRunConfig: # Set default chunking strategy if None if self.chunking_strategy is None: - from .chunking_strategy import RegexChunking self.chunking_strategy = RegexChunking() @staticmethod @@ -489,7 +482,7 @@ class CrawlerRunConfig: # Content Processing Parameters word_count_threshold=kwargs.get("word_count_threshold", 200), extraction_strategy=kwargs.get("extraction_strategy"), - chunking_strategy=kwargs.get("chunking_strategy"), + chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()), markdown_generator=kwargs.get("markdown_generator"), content_filter=kwargs.get("content_filter"), 
only_text=kwargs.get("only_text", False), @@ -500,7 +493,7 @@ class CrawlerRunConfig: remove_forms=kwargs.get("remove_forms", False), prettiify=kwargs.get("prettiify", False), parser_type=kwargs.get("parser_type", "lxml"), - scraping_mode=kwargs.get("scraping_mode", ScrapingMode.BEAUTIFULSOUP), + scraping_strategy=kwargs.get("scraping_strategy"), # SSL Parameters fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), @@ -574,7 +567,7 @@ class CrawlerRunConfig: "remove_forms": self.remove_forms, "prettiify": self.prettiify, "parser_type": self.parser_type, - "scraping_mode": self.scraping_mode, + "scraping_strategy": self.scraping_strategy, "fetch_ssl_certificate": self.fetch_ssl_certificate, "cache_mode": self.cache_mode, "session_id": self.session_id, diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index f13fdae1..9ef19966 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -543,27 +543,20 @@ class AsyncWebCrawler: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" t1 = time.perf_counter() - # Initialize scraping strategy based on mode - if config.scraping_mode == ScrapingMode.LXML: - scrapping_strategy = LXMLWebScrapingStrategy(logger=self.logger) - else: # Default to BeautifulSoup - scrapping_strategy = WebScrapingStrategy(logger=self.logger) + # Get scraping strategy and ensure it has a logger + scraping_strategy = config.scraping_strategy + if not scraping_strategy.logger: + scraping_strategy.logger = self.logger # Process HTML content params = {k:v for k, v in config.to_dict().items() if k not in ["url"]} # add keys from kwargs to params that doesn't exist in params params.update({k:v for k, v in kwargs.items() if k not in params.keys()}) - result = scrapping_strategy.scrap( + result = scraping_strategy.scrap( url, html, - **params, - # word_count_threshold=config.word_count_threshold, - # css_selector=config.css_selector, - # only_text=config.only_text, - # 
image_description_min_word_threshold=config.image_description_min_word_threshold, - # content_filter=config.content_filter, - # **kwargs + **params ) if result is None: @@ -576,13 +569,17 @@ class AsyncWebCrawler: - # Extract results - cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) - fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) - fit_html = sanitize_input_encode(result.get("fit_html", "")) - media = result.get("media", []) - links = result.get("links", []) - metadata = result.get("metadata", {}) + # Extract results - handle both dict and ScrapingResult + if isinstance(result, dict): + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) + media = result.get("media", {}) + links = result.get("links", {}) + metadata = result.get("metadata", {}) + else: + cleaned_html = sanitize_input_encode(result.cleaned_html) + media = result.media.model_dump() + links = result.links.model_dump() + metadata = result.metadata # Markdown Generation markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator() @@ -610,10 +607,7 @@ class AsyncWebCrawler: ) # Handle content extraction if needed - if (extracted_content is None and - config.extraction_strategy and - config.chunking_strategy and - not isinstance(config.extraction_strategy, NoExtractionStrategy)): + if (not bool(extracted_content) and config.extraction_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy)): t1 = time.perf_counter() @@ -664,8 +658,8 @@ class AsyncWebCrawler: cleaned_html=cleaned_html, markdown_v2=markdown_v2, markdown=markdown, - fit_markdown=fit_markdown, - fit_html=fit_html, + fit_markdown=markdown_result.fit_markdown, + fit_html=markdown_result.fit_html, media=media, links=links, metadata=metadata, diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index e9f631c7..ae09037d 100644 --- a/crawl4ai/content_scraping_strategy.py +++ 
b/crawl4ai/content_scraping_strategy.py @@ -21,6 +21,7 @@ from .utils import ( from lxml import etree from lxml import html as lhtml from typing import Dict, Any, List, Tuple +from .models import ScrapingResult, MediaItem, Link, Media, Links # Pre-compile regular expressions for Open Graph and Twitter metadata OG_REGEX = re.compile(r'^og:') @@ -73,11 +74,11 @@ def fetch_image_file_size(img, base_url): class ContentScrapingStrategy(ABC): @abstractmethod - def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult: pass @abstractmethod - async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult: pass class WebScrapingStrategy(ContentScrapingStrategy): @@ -101,7 +102,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): log_method = getattr(self.logger, level) log_method(message=message, tag=tag, **kwargs) - def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult: """ Main entry point for content scraping. @@ -111,16 +112,40 @@ class WebScrapingStrategy(ContentScrapingStrategy): **kwargs: Additional keyword arguments. Returns: - Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys: - - - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'. - - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'. - - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'. 
- - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown' + ScrapingResult: A structured result containing the scraped content. """ - return self._scrap(url, html, is_async=False, **kwargs) + raw_result = self._scrap(url, html, is_async=False, **kwargs) + if raw_result is None: + return ScrapingResult( + cleaned_html="", + success=False, + media=Media(), + links=Links(), + metadata={} + ) - async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + # Convert media items + media = Media( + images=[MediaItem(**img) for img in raw_result.get("media", {}).get("images", []) if img], + videos=[MediaItem(**vid) for vid in raw_result.get("media", {}).get("videos", []) if vid], + audios=[MediaItem(**aud) for aud in raw_result.get("media", {}).get("audios", []) if aud] + ) + + # Convert links + links = Links( + internal=[Link(**link) for link in raw_result.get("links", {}).get("internal", []) if link], + external=[Link(**link) for link in raw_result.get("links", {}).get("external", []) if link] + ) + + return ScrapingResult( + cleaned_html=raw_result.get("cleaned_html", ""), + success=raw_result.get("success", False), + media=media, + links=links, + metadata=raw_result.get("metadata", {}) + ) + + async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult: """ Main entry point for asynchronous content scraping. @@ -130,12 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): **kwargs: Additional keyword arguments. Returns: - Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys: - - - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'. - - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'. 
- - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'. - - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown' + ScrapingResult: A structured result containing the scraped content. """ return await asyncio.to_thread(self._scrap, url, html, **kwargs) diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 1e2b4794..48aad544 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, HttpUrl -from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple +from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple, Any from enum import Enum from dataclasses import dataclass, field from .ssl_certificate import SSLCertificate @@ -129,3 +129,38 @@ class AsyncCrawlResponse(BaseModel): class Config: arbitrary_types_allowed = True + +############################### +# Scraping Models +############################### +class MediaItem(BaseModel): + src: str + alt: Optional[str] = None + desc: Optional[str] = None + score: int + type: str = "image" + group_id: int + format: Optional[str] = None + width: Optional[int] = None + +class Link(BaseModel): + href: str + text: str + title: Optional[str] = None + base_domain: str + +class Media(BaseModel): + images: List[MediaItem] = [] + videos: List[MediaItem] = [] # Using MediaItem model for now, can be extended with Video model if needed + audios: List[MediaItem] = [] # Using MediaItem model for now, can be extended with Audio model if needed + +class Links(BaseModel): + internal: List[Link] = [] + external: List[Link] = [] + +class ScrapingResult(BaseModel): + cleaned_html: str + success: bool + media: Media = Media() + links: Links = Links() + metadata: Dict[str, Any] = {} diff --git a/docs/md_v2/core/content-selection.md b/docs/md_v2/core/content-selection.md index 254081ae..5d46ef10 
100644 --- a/docs/md_v2/core/content-selection.md +++ b/docs/md_v2/core/content-selection.md @@ -320,14 +320,14 @@ if __name__ == "__main__": ## 6. Scraping Modes -Crawl4AI provides two different scraping modes for HTML content processing: BeautifulSoup (default) and LXML. The LXML mode offers significantly better performance, especially for large HTML documents. +Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents. ```python -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, ScrapingMode +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy async def main(): config = CrawlerRunConfig( - scraping_mode=ScrapingMode.LXML # Faster alternative to default BeautifulSoup + scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup ) async with AsyncWebCrawler() as crawler: result = await crawler.arun( @@ -336,20 +336,72 @@ async def main(): ) ``` +You can also create your own custom scraping strategy by inheriting from `ContentScrapingStrategy`.
The strategy must return a `ScrapingResult` object with the following structure: + +```python +import asyncio + +from crawl4ai import ContentScrapingStrategy +from crawl4ai.models import ScrapingResult, MediaItem, Media, Link, Links + +class CustomScrapingStrategy(ContentScrapingStrategy): + def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult: + # Implement your custom scraping logic here + return ScrapingResult( + cleaned_html="...", # Cleaned HTML content + success=True, # Whether scraping was successful + media=Media( + images=[ # List of images found + MediaItem( + src="https://example.com/image.jpg", + alt="Image description", + desc="Surrounding text", + score=1, + type="image", + group_id=1, + format="jpg", + width=800 + ) + ], + videos=[], # List of videos (same structure as images) + audios=[] # List of audio files (same structure as images) + ), + links=Links( + internal=[ # List of internal links + Link( + href="https://example.com/page", + text="Link text", + title="Link title", + base_domain="example.com" + ) + ], + external=[] # List of external links (same structure) + ), + metadata={ # Additional metadata + "title": "Page Title", + "description": "Page description" + } + ) + + async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult: + # For simple cases, you can use the sync version + return await asyncio.to_thread(self.scrap, url, html, **kwargs) +``` + ### Performance Considerations -The LXML mode can be up to 10-20x faster than BeautifulSoup mode, particularly when processing large HTML documents. However, please note: +The LXML strategy can be up to 10-20x faster than the BeautifulSoup strategy, particularly when processing large HTML documents. However, please note: -1. LXML mode is currently experimental +1. The LXML strategy is currently experimental 2. In some edge cases, the parsing results might differ slightly from BeautifulSoup 3.
If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example -Choose LXML mode when: +Choose the LXML strategy when: - Processing large HTML documents (recommended for >100KB) - Performance is critical - Working with well-formed HTML -Stick to BeautifulSoup mode (default) when: +Stick to the BeautifulSoup strategy (default) when: - Maximum compatibility is needed - Working with malformed HTML - Exact parsing behavior is critical diff --git a/tests/test_scraping_strategy.py b/tests/test_scraping_strategy.py new file mode 100644 index 00000000..6d742182 --- /dev/null +++ b/tests/test_scraping_strategy.py @@ -0,0 +1,21 @@ +import nest_asyncio +nest_asyncio.apply() + +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode + +async def main(): + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + config=config + ) + print(f"Success: {result.success}") + print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}") + +if __name__ == "__main__": + asyncio.run(main())