diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dd26623..96b1eb0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,32 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Version 0.5.0 (2025-03-02) + +### Added + +- *(profiles)* Add BrowserProfiler class for dedicated browser profile management +- *(cli)* Add interactive profile management to CLI with rich UI +- *(profiles)* Add ability to crawl directly from profile management interface +- *(browser)* Support identity-based browsing with persistent profiles +- *(deep-crawling)* Add max_pages parameter to limit the number of pages crawled in all deep crawling strategies +- *(deep-crawling)* Add score_threshold parameter to BFS and DFS strategies to filter URLs by score + +### Changed + +- *(browser)* Refactor profile management from ManagedBrowser to BrowserProfiler class +- *(cli)* Enhance CLI with profile selection and status display for crawling +- *(examples)* Update identity-based browsing example to use BrowserProfiler class +- *(docs)* Update identity-based crawling documentation +- *(docs)* Update deep crawling documentation with max_pages and score_threshold parameters +- *(examples)* Add example demonstrating the use of max_pages and score_threshold parameters + +### Fixed + +- *(browser)* Fix profile detection and management on different platforms +- *(cli)* Fix CLI command structure for better user experience +- *(deep-crawling)* Improve BFS and DFS strategies to handle page count limits more efficiently + ## Version 0.5.0 (2025-02-21) diff --git a/README.md b/README.md index 09874b88..e98af5e7 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,9 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. 
It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out latest update v0.4.3bx](#-recent-updates) +[✨ Check out latest update v0.5.0](#-recent-updates) -πŸŽ‰ **Version 0.4.3bx is out!** This release brings exciting new features like a Memory Dispatcher System, Streaming Support, LLM-Powered Markdown Generation, Schema Generation, and Robots.txt Compliance! [Read the release notes β†’](https://docs.crawl4ai.com/blog) +πŸŽ‰ **Version 0.5.0 is out!** This major release introduces Deep Crawling with BFS/DFS/BestFirst strategies, Memory-Adaptive Dispatcher, Multiple Crawling Strategies (Playwright and HTTP), Docker Deployment with FastAPI, Command-Line Interface (CLI), and more! [Read the release notes β†’](https://docs.crawl4ai.com/blog)
πŸ€“ My Personal Story @@ -68,7 +68,7 @@ If you encounter any browser-related issues, you can install them manually: python -m playwright install --with-deps chromium ``` -2. Run a simple web crawl: +2. Run a simple web crawl with Python: ```python import asyncio from crawl4ai import * @@ -84,6 +84,18 @@ if __name__ == "__main__": asyncio.run(main()) ``` +3. Or use the new command-line interface: +```bash +# Basic crawl with markdown output +crwl https://www.nbcnews.com/business -o markdown + +# Deep crawl with BFS strategy, max 10 pages +crwl https://docs.crawl4ai.com --deep-crawl bfs --max-pages 10 + +# Use LLM extraction with a specific question +crwl https://www.example.com/products -q "Extract all product prices" +``` + ## ✨ Features
@@ -112,6 +124,7 @@ if __name__ == "__main__": - πŸ–₯️ **Managed Browser**: Use user-owned browsers with full control, avoiding bot detection. - πŸ”„ **Remote Browser Control**: Connect to Chrome Developer Tools Protocol for remote, large-scale data extraction. +- πŸ‘€ **Browser Profiler**: Create and manage persistent profiles with saved authentication states, cookies, and settings. - πŸ”’ **Session Management**: Preserve browser states and reuse them for multi-step crawling. - 🧩 **Proxy Support**: Seamlessly connect to proxies with authentication for secure access. - βš™οΈ **Full Browser Control**: Modify headers, cookies, user agents, and more for tailored crawling setups. @@ -140,10 +153,11 @@ if __name__ == "__main__":
πŸš€ Deployment -- 🐳 **Dockerized Setup**: Optimized Docker image with API server for easy deployment. +- 🐳 **Dockerized Setup**: Optimized Docker image with FastAPI server for easy deployment. +- πŸ”‘ **Secure Authentication**: Built-in JWT token authentication for API security. - πŸ”„ **API Gateway**: One-click deployment with secure token authentication for API-based workflows. - 🌐 **Scalable Architecture**: Designed for mass-scale production and optimized server performance. -- βš™οΈ **DigitalOcean Deployment**: Ready-to-deploy configurations for DigitalOcean and similar platforms. +- ☁️ **Cloud Deployment**: Ready-to-deploy configurations for major cloud platforms.
@@ -406,7 +420,7 @@ if __name__ == "__main__": ```python import os import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig from crawl4ai.extraction_strategy import LLMExtractionStrategy from pydantic import BaseModel, Field @@ -422,7 +436,7 @@ async def main(): extraction_strategy=LLMExtractionStrategy( # Here you can use any provider that Litellm library supports, for instance: ollama/qwen2 # provider="ollama/qwen2", api_token="no-token", - llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')), + llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')), schema=OpenAIModelFee.schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. @@ -486,21 +500,31 @@ async def test_news_crawl(): ## ✨ Recent Updates -- **πŸš€ New Dispatcher System**: Scale to thousands of URLs with intelligent **memory monitoring**, **concurrency control**, and optional **rate limiting**. (See `MemoryAdaptiveDispatcher`, `SemaphoreDispatcher`, `RateLimiter`, `CrawlerMonitor`) -- **⚑ Streaming Mode**: Process results **as they arrive** instead of waiting for an entire batch to complete. (Set `stream=True` in `CrawlerRunConfig`) -- **πŸ€– Enhanced LLM Integration**: - - **Automatic schema generation**: Create extraction rules from HTML using OpenAI or Ollama, no manual CSS/XPath needed. - - **LLM-powered Markdown filtering**: Refine your markdown output with a new `LLMContentFilter` that understands content relevance. - - **Ollama Support**: Use open-source or self-hosted models for private or cost-effective extraction. -- **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental). 
-- **πŸ€– robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching. -- **πŸ”„ Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence. -- **➑️ URL Redirection Tracking**: The `redirected_url` field now captures the final destination after any redirects. -- **πŸͺž Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites. -- **πŸ“ˆ Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`. -- **πŸ“ Improved Documentation**: More examples, clearer explanations, and updated tutorials. +### Version 0.5.0 Major Release Highlights -Read the full details in our [0.4.3bx Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md). +- **πŸš€ Deep Crawling System**: Explore websites beyond initial URLs with three strategies: + - **BFS Strategy**: Breadth-first search explores websites level by level + - **DFS Strategy**: Depth-first search explores each branch deeply before backtracking + - **BestFirst Strategy**: Uses scoring functions to prioritize which URLs to crawl next + - **Page Limiting**: Control the maximum number of pages to crawl with `max_pages` parameter + - **Score Thresholds**: Filter URLs based on relevance scores +- **⚑ Memory-Adaptive Dispatcher**: Dynamically adjusts concurrency based on system memory with built-in rate limiting +- **πŸ”„ Multiple Crawling Strategies**: + - **AsyncPlaywrightCrawlerStrategy**: Browser-based crawling with JavaScript support (Default) + - **AsyncHTTPCrawlerStrategy**: Fast, lightweight HTTP-only crawler for simple tasks +- **🐳 Docker Deployment**: Easy deployment with FastAPI server and streaming/non-streaming endpoints +- **πŸ’» Command-Line Interface**: New `crwl` CLI provides convenient terminal access to all features with intuitive commands and 
configuration options +- **πŸ‘€ Browser Profiler**: Create and manage persistent browser profiles to save authentication states, cookies, and settings for seamless crawling of protected content +- **🧠 Crawl4AI Coding Assistant**: AI-powered coding assistant to answer your question for Crawl4ai, and generate proper code for crawling. +- **🏎️ LXML Scraping Mode**: Fast HTML parsing using the `lxml` library for improved performance +- **🌐 Proxy Rotation**: Built-in support for proxy switching with `RoundRobinProxyStrategy` +- **πŸ€– LLM Content Filter**: Intelligent markdown generation using LLMs +- **πŸ“„ PDF Processing**: Extract text, images, and metadata from PDF files +- **πŸ”— URL Redirection Tracking**: Automatically follow and record HTTP redirects +- **πŸ€– LLM Schema Generation**: Easily create extraction schemas with LLM assistance +- **πŸ” robots.txt Compliance**: Respect website crawling rules + +Read the full details in our [0.5.0 Release Notes](https://docs.crawl4ai.com/blog/releases/0.5.0.html) or check the [CHANGELOG](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md). 
## Version Numbering in Crawl4AI diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 12322540..ff238964 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -2,7 +2,8 @@ import warnings from .async_webcrawler import AsyncWebCrawler, CacheMode -from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig + from .content_scraping_strategy import ( ContentScrapingStrategy, WebScrapingStrategy, @@ -22,6 +23,7 @@ from .extraction_strategy import ( CosineStrategy, JsonCssExtractionStrategy, JsonXPathExtractionStrategy, + JsonLxmlExtractionStrategy ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator @@ -42,12 +44,14 @@ from .async_dispatcher import ( ) from .docker_client import Crawl4aiDockerClient from .hub import CrawlerHub +from .browser_profiler import BrowserProfiler from .deep_crawling import ( DeepCrawlStrategy, BFSDeepCrawlStrategy, FilterChain, - ContentTypeFilter, + URLPatternFilter, DomainFilter, + ContentTypeFilter, URLFilter, FilterStats, SEOFilter, @@ -66,11 +70,14 @@ __all__ = [ "AsyncLoggerBase", "AsyncLogger", "AsyncWebCrawler", + "BrowserProfiler", + "LLMConfig", "DeepCrawlStrategy", "BFSDeepCrawlStrategy", "BestFirstCrawlingStrategy", "DFSDeepCrawlStrategy", "FilterChain", + "URLPatternFilter", "ContentTypeFilter", "DomainFilter", "FilterStats", @@ -97,6 +104,7 @@ __all__ = [ "CosineStrategy", "JsonCssExtractionStrategy", "JsonXPathExtractionStrategy", + "JsonLxmlExtractionStrategy", "ChunkingStrategy", "RegexChunking", "DefaultMarkdownGenerator", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 77366e02..9477177b 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.5.0" +__version__ = "0.5.0.post4" diff --git a/crawl4ai/async_configs.py 
b/crawl4ai/async_configs.py index 4c89d506..937ae4eb 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -11,19 +11,23 @@ from .config import ( ) from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator -from .extraction_strategy import ExtractionStrategy +from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking + from .markdown_generation_strategy import MarkdownGenerationStrategy from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from .deep_crawling import DeepCrawlStrategy -from typing import Union, List + from .cache_context import CacheMode from .proxy_strategy import ProxyRotationStrategy +from typing import Union, List import inspect from typing import Any, Dict, Optional from enum import Enum +from .proxy_strategy import ProxyConfig + def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: """ @@ -178,7 +182,7 @@ class BrowserConfig: is "chromium". Default: "chromium". proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used. Default: None. - proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. + proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. viewport_width (int): Default viewport width for pages. Default: 1080. viewport_height (int): Default viewport height for pages. Default: 600. 
@@ -223,7 +227,7 @@ class BrowserConfig: chrome_channel: str = "chromium", channel: str = "chromium", proxy: str = None, - proxy_config: dict = None, + proxy_config: Union[ProxyConfig, dict, None] = None, viewport_width: int = 1080, viewport_height: int = 600, viewport: dict = None, @@ -313,7 +317,7 @@ class BrowserConfig: chrome_channel=kwargs.get("chrome_channel", "chromium"), channel=kwargs.get("channel", "chromium"), proxy=kwargs.get("proxy"), - proxy_config=kwargs.get("proxy_config"), + proxy_config=kwargs.get("proxy_config", None), viewport_width=kwargs.get("viewport_width", 1080), viewport_height=kwargs.get("viewport_height", 600), accept_downloads=kwargs.get("accept_downloads", False), @@ -497,6 +501,15 @@ class CrawlerRunConfig(): Default: False. css_selector (str or None): CSS selector to extract a specific portion of the page. Default: None. + + target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation + and structured data extraction. When you set this, only the contents + of these elements are processed for extraction and Markdown generation. + If you do not set any value, the entire page is processed. + The difference between this and css_selector is that this will shrink + the initial raw HTML to the selected element, while this will only affect + the extraction and Markdown generation. + Default: None excluded_tags (list of str or None): List of HTML tags to exclude from processing. Default: None. excluded_selector (str or None): CSS selector to exclude from processing. @@ -513,7 +526,7 @@ class CrawlerRunConfig(): Default: "lxml". scraping_strategy (ContentScrapingStrategy): Scraping strategy to use. Default: WebScrapingStrategy. - proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. + proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. 
# SSL Parameters @@ -593,6 +606,8 @@ class CrawlerRunConfig(): Default: IMAGE_SCORE_THRESHOLD (e.g., 3). exclude_external_images (bool): If True, exclude all external images from processing. Default: False. + table_score_threshold (int): Minimum score threshold for processing a table. + Default: 7. # Link and Domain Handling Parameters exclude_social_media_domains (list of str): List of domains to exclude for social media links. @@ -646,6 +661,7 @@ class CrawlerRunConfig(): markdown_generator: MarkdownGenerationStrategy = None, only_text: bool = False, css_selector: str = None, + target_elements: List[str] = None, excluded_tags: list = None, excluded_selector: str = None, keep_data_attributes: bool = False, @@ -654,7 +670,7 @@ class CrawlerRunConfig(): prettiify: bool = False, parser_type: str = "lxml", scraping_strategy: ContentScrapingStrategy = None, - proxy_config: dict = None, + proxy_config: Union[ProxyConfig, dict, None] = None, proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None, # SSL Parameters fetch_ssl_certificate: bool = False, @@ -694,6 +710,7 @@ class CrawlerRunConfig(): pdf: bool = False, image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, image_score_threshold: int = IMAGE_SCORE_THRESHOLD, + table_score_threshold: int = 7, exclude_external_images: bool = False, # Link and Domain Handling Parameters exclude_social_media_domains: list = None, @@ -725,6 +742,7 @@ class CrawlerRunConfig(): self.markdown_generator = markdown_generator self.only_text = only_text self.css_selector = css_selector + self.target_elements = target_elements or [] self.excluded_tags = excluded_tags or [] self.excluded_selector = excluded_selector or "" self.keep_data_attributes = keep_data_attributes @@ -779,6 +797,7 @@ class CrawlerRunConfig(): self.image_description_min_word_threshold = image_description_min_word_threshold self.image_score_threshold = image_score_threshold self.exclude_external_images = exclude_external_images + 
self.table_score_threshold = table_score_threshold # Link and Domain Handling Parameters self.exclude_social_media_domains = ( @@ -854,6 +873,7 @@ class CrawlerRunConfig(): markdown_generator=kwargs.get("markdown_generator"), only_text=kwargs.get("only_text", False), css_selector=kwargs.get("css_selector"), + target_elements=kwargs.get("target_elements", []), excluded_tags=kwargs.get("excluded_tags", []), excluded_selector=kwargs.get("excluded_selector", ""), keep_data_attributes=kwargs.get("keep_data_attributes", False), @@ -909,6 +929,7 @@ class CrawlerRunConfig(): image_score_threshold=kwargs.get( "image_score_threshold", IMAGE_SCORE_THRESHOLD ), + table_score_threshold=kwargs.get("table_score_threshold", 7), exclude_external_images=kwargs.get("exclude_external_images", False), # Link and Domain Handling Parameters exclude_social_media_domains=kwargs.get( @@ -954,6 +975,7 @@ class CrawlerRunConfig(): "markdown_generator": self.markdown_generator, "only_text": self.only_text, "css_selector": self.css_selector, + "target_elements": self.target_elements, "excluded_tags": self.excluded_tags, "excluded_selector": self.excluded_selector, "keep_data_attributes": self.keep_data_attributes, @@ -997,6 +1019,7 @@ class CrawlerRunConfig(): "pdf": self.pdf, "image_description_min_word_threshold": self.image_description_min_word_threshold, "image_score_threshold": self.image_score_threshold, + "table_score_threshold": self.table_score_threshold, "exclude_external_images": self.exclude_external_images, "exclude_social_media_domains": self.exclude_social_media_domains, "exclude_external_links": self.exclude_external_links, @@ -1042,7 +1065,7 @@ class CrawlerRunConfig(): return CrawlerRunConfig.from_kwargs(config_dict) -class LlmConfig: +class LLMConfig: def __init__( self, provider: str = DEFAULT_PROVIDER, @@ -1063,8 +1086,8 @@ class LlmConfig: @staticmethod - def from_kwargs(kwargs: dict) -> "LlmConfig": - return LlmConfig( + def from_kwargs(kwargs: dict) -> "LLMConfig": + 
return LLMConfig( provider=kwargs.get("provider", DEFAULT_PROVIDER), api_token=kwargs.get("api_token"), base_url=kwargs.get("base_url"), @@ -1084,8 +1107,10 @@ class LlmConfig: **kwargs: Key-value pairs of configuration options to update Returns: - LLMConfig: A new instance with the specified updates + llm_config: A new instance with the specified updates """ config_dict = self.to_dict() config_dict.update(kwargs) - return LlmConfig.from_kwargs(config_dict) + return LLMConfig.from_kwargs(config_dict) + + diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 19b6a689..960c2d6f 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -767,6 +767,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Handle wait_for condition # Todo: Decide how to handle this if not config.wait_for and config.css_selector and False: + # if not config.wait_for and config.css_selector: config.wait_for = f"css:{config.css_selector}" if config.wait_for: @@ -806,8 +807,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if config.remove_overlay_elements: await self.remove_overlay_elements(page) - # Get final HTML content - html = await page.content() + if config.css_selector: + try: + # Handle comma-separated selectors by splitting them + selectors = [s.strip() for s in config.css_selector.split(',')] + html_parts = [] + + for selector in selectors: + try: + content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''") + html_parts.append(content) + except Error as e: + print(f"Warning: Could not get content for selector '{selector}': {str(e)}") + + # Wrap in a div to create a valid HTML structure + html = f"
\n" + "\n".join(html_parts) + "\n
" + except Error as e: + raise RuntimeError(f"Failed to extract HTML content: {str(e)}") + else: + html = await page.content() + + # # Get final HTML content + # html = await page.content() await self.execute_hook( "before_return_html", page=page, html=html, context=context, config=config ) diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 02b377e6..e9a9daf1 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -4,10 +4,10 @@ import aiosqlite import asyncio from typing import Optional, Dict from contextlib import asynccontextmanager -import logging import json # Added for serialization/deserialization from .utils import ensure_content_dirs, generate_content_hash from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown +# , StringCompatibleMarkdown import aiofiles from .utils import VersionManager from .async_logger import AsyncLogger diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index 56c4d567..b587d011 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -13,7 +13,7 @@ from rich.live import Live from rich.table import Table from rich.console import Console from rich import box -from datetime import timedelta +from datetime import timedelta, datetime from collections.abc import AsyncGenerator import time import psutil @@ -24,6 +24,8 @@ from urllib.parse import urlparse import random from abc import ABC, abstractmethod +from math import inf as infinity + class RateLimiter: def __init__( @@ -250,7 +252,7 @@ class CrawlerMonitor: key=lambda x: ( x.status != CrawlStatus.IN_PROGRESS, x.status != CrawlStatus.QUEUED, - x.end_time or float('inf'), + x.end_time or infinity, ), )[: self.max_visible_rows] diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 1060fdcf..430e26a0 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -4,7 +4,7 @@ import sys import time from colorama import Fore from 
pathlib import Path -from typing import Optional, List +from typing import Optional, List, Generic, TypeVar import json import asyncio @@ -23,7 +23,7 @@ from .async_crawler_strategy import ( AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse, ) -from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode +from .cache_context import CacheMode, CacheContext from .markdown_generation_strategy import ( DefaultMarkdownGenerator, MarkdownGenerationStrategy, @@ -44,17 +44,46 @@ from .utils import ( RobotsParser, ) -from typing import Union, AsyncGenerator, TypeVar +from typing import Union, AsyncGenerator CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) -RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] +# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] -DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] -DeepCrawlManyReturn = Union[ - List[List[CrawlResultT]], - AsyncGenerator[CrawlResultT, None], +class CrawlResultContainer(Generic[CrawlResultT]): + def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]): + # Normalize to a list + if isinstance(results, list): + self._results = results + else: + self._results = [results] + + def __iter__(self): + return iter(self._results) + + def __getitem__(self, index): + return self._results[index] + + def __len__(self): + return len(self._results) + + def __getattr__(self, attr): + # Delegate attribute access to the first element. + if self._results: + return getattr(self._results[0], attr) + raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'") + + def __repr__(self): + return f"{self.__class__.__name__}({self._results!r})" + +# Redefine the union type. Now synchronous calls always return a container, +# while stream mode is handled with an AsyncGenerator. 
+RunManyReturn = Union[ + CrawlResultContainer[CrawlResultT], + AsyncGenerator[CrawlResultT, None] ] + + class AsyncWebCrawler: """ Asynchronous web crawler with flexible caching capabilities. @@ -222,24 +251,7 @@ class AsyncWebCrawler: async def arun( self, url: str, - config: Optional[CrawlerRunConfig] = None, - # Legacy parameters maintained for backwards compatibility - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - content_filter: RelevantContentFilter = None, - cache_mode: Optional[CacheMode] = None, - # Deprecated cache parameters - bypass_cache: bool = False, - disable_cache: bool = False, - no_cache_read: bool = False, - no_cache_write: bool = False, - # Other legacy parameters - css_selector: str = None, - screenshot: bool = False, - pdf: bool = False, - user_agent: str = None, - verbose=True, + config: CrawlerRunConfig = None, **kwargs, ) -> RunManyReturn: """ @@ -270,45 +282,13 @@ class AsyncWebCrawler: Returns: CrawlResult: The result of crawling and processing """ - crawler_config = config + config = config or CrawlerRunConfig() if not isinstance(url, str) or not url: raise ValueError("Invalid URL, make sure the URL is a non-empty string") async with self._lock or self.nullcontext(): try: - # Handle configuration - if crawler_config is not None: - config = crawler_config - else: - # Merge all parameters into a single kwargs dict for config creation - config_kwargs = { - "word_count_threshold": word_count_threshold, - "extraction_strategy": extraction_strategy, - "chunking_strategy": chunking_strategy, - "content_filter": content_filter, - "cache_mode": cache_mode, - "bypass_cache": bypass_cache, - "disable_cache": disable_cache, - "no_cache_read": no_cache_read, - "no_cache_write": no_cache_write, - "css_selector": css_selector, - "screenshot": screenshot, - "pdf": pdf, - "verbose": verbose, - **kwargs, - } - config = 
CrawlerRunConfig.from_kwargs(config_kwargs) - - # Handle deprecated cache parameters - if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): - # Convert legacy parameters if cache_mode not provided - if config.cache_mode is None: - config.cache_mode = _legacy_to_cache_mode( - disable_cache=disable_cache, - bypass_cache=bypass_cache, - no_cache_read=no_cache_read, - no_cache_write=no_cache_write, - ) + self.logger.verbose = config.verbose # Default to ENABLED if no cache mode specified if config.cache_mode is None: @@ -344,7 +324,11 @@ class AsyncWebCrawler: # If screenshot is requested but its not in cache, then set cache_result to None screenshot_data = cached_result.screenshot pdf_data = cached_result.pdf - if config.screenshot and not screenshot or config.pdf and not pdf: + # if config.screenshot and not screenshot or config.pdf and not pdf: + if config.screenshot and not screenshot_data: + cached_result = None + + if config.pdf and not pdf_data: cached_result = None self.logger.url_status( @@ -358,12 +342,11 @@ class AsyncWebCrawler: if config and config.proxy_rotation_strategy: next_proxy = await config.proxy_rotation_strategy.get_next_proxy() if next_proxy: - if verbose: - self.logger.info( - message="Switch proxy: {proxy}", - tag="PROXY", - params={"proxy": next_proxy.server}, - ) + self.logger.info( + message="Switch proxy: {proxy}", + tag="PROXY", + params={"proxy": next_proxy.server}, + ) config.proxy_config = next_proxy # config = config.clone(proxy_config=next_proxy) @@ -371,8 +354,8 @@ class AsyncWebCrawler: if not cached_result or not html: t1 = time.perf_counter() - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) + if config.user_agent: + self.crawler_strategy.update_user_agent(config.user_agent) # Check robots.txt if enabled if config and config.check_robots_txt: @@ -452,7 +435,7 @@ class AsyncWebCrawler: if cache_context.should_write() and not bool(cached_result): await async_db_manager.acache_url(crawl_result) 
- return crawl_result + return CrawlResultContainer(crawl_result) else: self.logger.success( @@ -469,7 +452,7 @@ class AsyncWebCrawler: cached_result.success = bool(html) cached_result.session_id = getattr(config, "session_id", None) cached_result.redirected_url = cached_result.redirected_url or url - return cached_result + return CrawlResultContainer(cached_result) except Exception as e: error_context = get_error_context(sys.exc_info()) @@ -487,8 +470,10 @@ class AsyncWebCrawler: tag="ERROR", ) - return CrawlResult( - url=url, html="", success=False, error_message=error_message + return CrawlResultContainer( + CrawlResult( + url=url, html="", success=False, error_message=error_message + ) ) async def aprocess_html( @@ -529,7 +514,8 @@ class AsyncWebCrawler: scraping_strategy.logger = self.logger # Process HTML content - params = {k: v for k, v in config.to_dict().items() if k not in ["url"]} + params = config.__dict__.copy() + params.pop("url", None) # add keys from kwargs to params that doesn't exist in params params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) @@ -584,9 +570,9 @@ class AsyncWebCrawler: # Log processing completion self.logger.info( - message="Processed {url:.50}... | Time: {timing}ms", + message="{url:.50}... 
| Time: {timing}s", tag="SCRAPE", - params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)}, + params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000}, ) ################################ @@ -664,17 +650,17 @@ class AsyncWebCrawler: config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, # Legacy parameters maintained for backwards compatibility - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - content_filter: RelevantContentFilter = None, - cache_mode: Optional[CacheMode] = None, - bypass_cache: bool = False, - css_selector: str = None, - screenshot: bool = False, - pdf: bool = False, - user_agent: str = None, - verbose=True, + # word_count_threshold=MIN_WORD_THRESHOLD, + # extraction_strategy: ExtractionStrategy = None, + # chunking_strategy: ChunkingStrategy = RegexChunking(), + # content_filter: RelevantContentFilter = None, + # cache_mode: Optional[CacheMode] = None, + # bypass_cache: bool = False, + # css_selector: str = None, + # screenshot: bool = False, + # pdf: bool = False, + # user_agent: str = None, + # verbose=True, **kwargs ) -> RunManyReturn: """ @@ -707,20 +693,21 @@ class AsyncWebCrawler: ): print(f"Processed {result.url}: {len(result.markdown)} chars") """ - if config is None: - config = CrawlerRunConfig( - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - content_filter=content_filter, - cache_mode=cache_mode, - bypass_cache=bypass_cache, - css_selector=css_selector, - screenshot=screenshot, - pdf=pdf, - verbose=verbose, - **kwargs, - ) + config = config or CrawlerRunConfig() + # if config is None: + # config = CrawlerRunConfig( + # word_count_threshold=word_count_threshold, + # extraction_strategy=extraction_strategy, + # chunking_strategy=chunking_strategy, + # content_filter=content_filter, + # 
cache_mode=cache_mode, + # bypass_cache=bypass_cache, + # css_selector=css_selector, + # screenshot=screenshot, + # pdf=pdf, + # verbose=verbose, + # **kwargs, + # ) if dispatcher is None: dispatcher = MemoryAdaptiveDispatcher( diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 29c2ba1b..38f87d9a 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -74,6 +74,7 @@ class ManagedBrowser: _get_browser_args(): Returns browser-specific command line arguments. _get_user_data_dir(): Returns the user data directory path. _cleanup(): Terminates the browser process and removes the temporary directory. + create_profile(): Static method to create a user profile by launching a browser for user interaction. """ browser_type: str @@ -288,6 +289,80 @@ class ManagedBrowser: tag="ERROR", params={"error": str(e)}, ) + + # These methods have been moved to BrowserProfiler class + @staticmethod + async def create_profile(browser_config=None, profile_name=None, logger=None): + """ + This method has been moved to the BrowserProfiler class. + + Creates a browser profile by launching a browser for interactive user setup + and waits until the user closes it. The profile is stored in a directory that + can be used later with BrowserConfig.user_data_dir. + + Please use BrowserProfiler.create_profile() instead. + + Example: + ```python + from crawl4ai.browser_profiler import BrowserProfiler + + profiler = BrowserProfiler() + profile_path = await profiler.create_profile(profile_name="my-login-profile") + ``` + """ + from .browser_profiler import BrowserProfiler + + # Create a BrowserProfiler instance and delegate to it + profiler = BrowserProfiler(logger=logger) + return await profiler.create_profile(profile_name=profile_name, browser_config=browser_config) + + @staticmethod + def list_profiles(): + """ + This method has been moved to the BrowserProfiler class. + + Lists all available browser profiles in the Crawl4AI profiles directory. 
+ + Please use BrowserProfiler.list_profiles() instead. + + Example: + ```python + from crawl4ai.browser_profiler import BrowserProfiler + + profiler = BrowserProfiler() + profiles = profiler.list_profiles() + ``` + """ + from .browser_profiler import BrowserProfiler + + # Create a BrowserProfiler instance and delegate to it + profiler = BrowserProfiler() + return profiler.list_profiles() + + @staticmethod + def delete_profile(profile_name_or_path): + """ + This method has been moved to the BrowserProfiler class. + + Delete a browser profile by name or path. + + Please use BrowserProfiler.delete_profile() instead. + + Example: + ```python + from crawl4ai.browser_profiler import BrowserProfiler + + profiler = BrowserProfiler() + success = profiler.delete_profile("my-profile") + ``` + """ + from .browser_profiler import BrowserProfiler + + # Create a BrowserProfiler instance and delegate to it + profiler = BrowserProfiler() + return profiler.delete_profile(profile_name_or_path) + + class BrowserManager: @@ -304,6 +379,7 @@ class BrowserManager: sessions (dict): Dictionary to store session information session_ttl (int): Session timeout in seconds """ + def __init__(self, browser_config: BrowserConfig, logger=None): """ @@ -358,8 +434,9 @@ class BrowserManager: self.playwright = await async_playwright().start() - if self.config.use_managed_browser: - cdp_url = await self.managed_browser.start() + if self.config.cdp_url or self.config.use_managed_browser: + self.config.use_managed_browser = True + cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) contexts = self.browser.contexts if contexts: @@ -454,9 +531,9 @@ class BrowserManager: ProxySettings(server=self.config.proxy) if self.config.proxy else ProxySettings( - server=self.config.proxy_config.get("server"), - username=self.config.proxy_config.get("username"), - 
password=self.config.proxy_config.get("password"), + server=self.config.proxy_config.server, + username=self.config.proxy_config.username, + password=self.config.proxy_config.password, ) ) browser_args["proxy"] = proxy_settings @@ -714,7 +791,10 @@ class BrowserManager: # If using a managed browser, just grab the shared default_context if self.config.use_managed_browser: context = self.default_context - page = await context.new_page() + pages = context.pages + page = next((p for p in pages if p.url == crawlerRunConfig.url), None) + if not page: + page = await context.new_page() else: # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) @@ -764,6 +844,9 @@ class BrowserManager: async def close(self): """Close all browser resources and clean up.""" + if self.config.cdp_url: + return + if self.config.sleep_on_close: await asyncio.sleep(0.5) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py new file mode 100644 index 00000000..be3274b7 --- /dev/null +++ b/crawl4ai/browser_profiler.py @@ -0,0 +1,769 @@ +""" +Browser Profiler Module + +This module provides a dedicated class for managing browser profiles +that can be used for identity-based crawling with Crawl4AI. +""" + +import os +import asyncio +import signal +import sys +import datetime +import uuid +import shutil +from typing import List, Dict, Optional, Any +from colorama import Fore, Style, init + +from .async_configs import BrowserConfig +from .browser_manager import ManagedBrowser +from .async_logger import AsyncLogger, AsyncLoggerBase +from .utils import get_home_folder + + +class BrowserProfiler: + """ + A dedicated class for managing browser profiles for Crawl4AI. 
+ + The BrowserProfiler allows you to: + - Create browser profiles interactively + - List available profiles + - Delete profiles when no longer needed + - Get profile paths for use in BrowserConfig + + Profiles are stored by default in ~/.crawl4ai/profiles/ + """ + + def __init__(self, logger: Optional[AsyncLoggerBase] = None): + """ + Initialize the BrowserProfiler. + + Args: + logger (AsyncLoggerBase, optional): Logger for outputting messages. + If None, a default AsyncLogger will be created. + """ + # Initialize colorama for colorful terminal output + init() + + # Create a logger if not provided + if logger is None: + self.logger = AsyncLogger(verbose=True) + elif not isinstance(logger, AsyncLoggerBase): + self.logger = AsyncLogger(verbose=True) + else: + self.logger = logger + + # Ensure profiles directory exists + self.profiles_dir = os.path.join(get_home_folder(), "profiles") + os.makedirs(self.profiles_dir, exist_ok=True) + + async def create_profile(self, + profile_name: Optional[str] = None, + browser_config: Optional[BrowserConfig] = None) -> Optional[str]: + """ + Creates a browser profile by launching a browser for interactive user setup + and waits until the user closes it. The profile is stored in a directory that + can be used later with BrowserConfig.user_data_dir. + + Args: + profile_name (str, optional): Name for the profile directory. + If None, a name is generated based on timestamp. + browser_config (BrowserConfig, optional): Configuration for the browser. + If None, a default configuration is used with headless=False. 
+ + Returns: + str: Path to the created profile directory, or None if creation failed + + Example: + ```python + profiler = BrowserProfiler() + + # Create a profile interactively + profile_path = await profiler.create_profile( + profile_name="my-login-profile" + ) + + # Use the profile in a crawler + browser_config = BrowserConfig( + headless=True, + use_managed_browser=True, + user_data_dir=profile_path + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # The crawler will now use your profile with all your cookies and login state + result = await crawler.arun("https://example.com/dashboard") + ``` + """ + # Create default browser config if none provided + if browser_config is None: + from .async_configs import BrowserConfig + browser_config = BrowserConfig( + browser_type="chromium", + headless=False, # Must be visible for user interaction + verbose=True + ) + else: + # Ensure headless is False for user interaction + browser_config.headless = False + + # Generate profile name if not provided + if not profile_name: + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}" + + # Sanitize profile name (replace spaces and special chars) + profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name) + + # Set user data directory + profile_path = os.path.join(self.profiles_dir, profile_name) + os.makedirs(profile_path, exist_ok=True) + + # Print instructions for the user with colorama formatting + border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" + self.logger.info(f"\n{border}", tag="PROFILE") + self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE") + self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE") + + self.logger.info("\nInstructions:", tag="PROFILE") + self.logger.info("1. 
A browser window will open for you to set up your profile.", tag="PROFILE") + self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE") + self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE") + self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE") + self.logger.info(f"{border}\n", tag="PROFILE") + + # Create managed browser instance + managed_browser = ManagedBrowser( + browser_type=browser_config.browser_type, + user_data_dir=profile_path, + headless=False, # Must be visible + logger=self.logger, + debugging_port=browser_config.debugging_port + ) + + # Set up signal handlers to ensure cleanup on interrupt + original_sigint = signal.getsignal(signal.SIGINT) + original_sigterm = signal.getsignal(signal.SIGTERM) + + # Define cleanup handler for signals + async def cleanup_handler(sig, frame): + self.logger.warning("\nCleaning up browser process...", tag="PROFILE") + await managed_browser.cleanup() + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + if sig == signal.SIGINT: + self.logger.error("Profile creation interrupted. 
Profile may be incomplete.", tag="PROFILE") + sys.exit(1) + + # Set signal handlers + def sigint_handler(sig, frame): + asyncio.create_task(cleanup_handler(sig, frame)) + + signal.signal(signal.SIGINT, sigint_handler) + signal.signal(signal.SIGTERM, sigint_handler) + + # Event to signal when user is done with the browser + user_done_event = asyncio.Event() + + # Run keyboard input loop in a separate task + async def listen_for_quit_command(): + import termios + import tty + import select + + # First output the prompt + self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE") + + # Save original terminal settings + fd = sys.stdin.fileno() + old_settings = termios.tcgetattr(fd) + + try: + # Switch to non-canonical mode (no line buffering) + tty.setcbreak(fd) + + while True: + # Check if input is available (non-blocking) + readable, _, _ = select.select([sys.stdin], [], [], 0.5) + if readable: + key = sys.stdin.read(1) + if key.lower() == 'q': + self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE") + user_done_event.set() + return + + # Check if the browser process has already exited + if managed_browser.browser_process and managed_browser.browser_process.poll() is not None: + self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE") + user_done_event.set() + return + + await asyncio.sleep(0.1) + + finally: + # Restore terminal settings + termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) + + try: + # Start the browser + await managed_browser.start() + + # Check if browser started successfully + browser_process = managed_browser.browser_process + if not browser_process: + self.logger.error("Failed to start browser process.", tag="PROFILE") + return None + + self.logger.info(f"Browser launched. 
{Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE") + + # Start listening for keyboard input + listener_task = asyncio.create_task(listen_for_quit_command()) + + # Wait for either the user to press 'q' or for the browser process to exit naturally + while not user_done_event.is_set() and browser_process.poll() is None: + await asyncio.sleep(0.5) + + # Cancel the listener task if it's still running + if not listener_task.done(): + listener_task.cancel() + try: + await listener_task + except asyncio.CancelledError: + pass + + # If the browser is still running and the user pressed 'q', terminate it + if browser_process.poll() is None and user_done_event.is_set(): + self.logger.info("Terminating browser process...", tag="PROFILE") + await managed_browser.cleanup() + + self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE") + + except Exception as e: + self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE") + await managed_browser.cleanup() + return None + finally: + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + + # Make sure browser is fully cleaned up + await managed_browser.cleanup() + + # Return the profile path + return profile_path + + def list_profiles(self) -> List[Dict[str, Any]]: + """ + Lists all available browser profiles in the Crawl4AI profiles directory. 
+ + Returns: + list: A list of dictionaries containing profile information: + [{"name": "profile_name", "path": "/path/to/profile", "created": datetime, "type": "chromium|firefox"}] + + Example: + ```python + profiler = BrowserProfiler() + + # List all available profiles + profiles = profiler.list_profiles() + + for profile in profiles: + print(f"Profile: {profile['name']}") + print(f" Path: {profile['path']}") + print(f" Created: {profile['created']}") + print(f" Browser type: {profile['type']}") + ``` + """ + if not os.path.exists(self.profiles_dir): + return [] + + profiles = [] + + for name in os.listdir(self.profiles_dir): + profile_path = os.path.join(self.profiles_dir, name) + + # Skip if not a directory + if not os.path.isdir(profile_path): + continue + + # Check if this looks like a valid browser profile + # For Chromium: Look for Preferences file + # For Firefox: Look for prefs.js file + is_valid = False + + if os.path.exists(os.path.join(profile_path, "Preferences")) or \ + os.path.exists(os.path.join(profile_path, "Default", "Preferences")): + is_valid = "chromium" + elif os.path.exists(os.path.join(profile_path, "prefs.js")): + is_valid = "firefox" + + if is_valid: + # Get creation time + created = datetime.datetime.fromtimestamp( + os.path.getctime(profile_path) + ) + + profiles.append({ + "name": name, + "path": profile_path, + "created": created, + "type": is_valid + }) + + # Sort by creation time, newest first + profiles.sort(key=lambda x: x["created"], reverse=True) + + return profiles + + def get_profile_path(self, profile_name: str) -> Optional[str]: + """ + Get the full path to a profile by name. 
+ + Args: + profile_name (str): Name of the profile (not the full path) + + Returns: + str: Full path to the profile directory, or None if not found + + Example: + ```python + profiler = BrowserProfiler() + + path = profiler.get_profile_path("my-profile") + if path: + print(f"Profile path: {path}") + else: + print("Profile not found") + ``` + """ + profile_path = os.path.join(self.profiles_dir, profile_name) + + # Check if path exists and is a valid profile + if not os.path.isdir(profile_path): + # Chrck if profile_name itself is full path + if os.path.isabs(profile_name): + profile_path = profile_name + else: + return None + + # Look for profile indicators + is_profile = ( + os.path.exists(os.path.join(profile_path, "Preferences")) or + os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or + os.path.exists(os.path.join(profile_path, "prefs.js")) + ) + + if not is_profile: + return None # Not a valid browser profile + + return profile_path + + def delete_profile(self, profile_name_or_path: str) -> bool: + """ + Delete a browser profile by name or path. 
+ + Args: + profile_name_or_path (str): Name of the profile or full path to profile directory + + Returns: + bool: True if the profile was deleted successfully, False otherwise + + Example: + ```python + profiler = BrowserProfiler() + + # Delete by name + success = profiler.delete_profile("my-profile") + + # Delete by path + success = profiler.delete_profile("/path/to/.crawl4ai/profiles/my-profile") + ``` + """ + # Determine if input is a name or a path + if os.path.isabs(profile_name_or_path): + # Full path provided + profile_path = profile_name_or_path + else: + # Just a name provided, construct path + profile_path = os.path.join(self.profiles_dir, profile_name_or_path) + + # Check if path exists and is a valid profile + if not os.path.isdir(profile_path): + return False + + # Look for profile indicators + is_profile = ( + os.path.exists(os.path.join(profile_path, "Preferences")) or + os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or + os.path.exists(os.path.join(profile_path, "prefs.js")) + ) + + if not is_profile: + return False # Not a valid browser profile + + # Delete the profile directory + try: + shutil.rmtree(profile_path) + return True + except Exception: + return False + + async def interactive_manager(self, crawl_callback=None): + """ + Launch an interactive profile management console. + + Args: + crawl_callback (callable, optional): Function to call when selecting option to use + a profile for crawling. It will be called with (profile_path, url). + + Example: + ```python + profiler = BrowserProfiler() + + # Define a custom crawl function + async def my_crawl_function(profile_path, url): + print(f"Crawling {url} with profile {profile_path}") + # Implement your crawling logic here + + # Start interactive manager + await profiler.interactive_manager(crawl_callback=my_crawl_function) + ``` + """ + while True: + self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"1. 
{Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU") + + # Only show crawl option if callback provided + if crawl_callback: + self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") + exit_option = "5" + else: + self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") + exit_option = "4" + + choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}") + + if choice == "1": + # Create new profile + name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}") + await self.create_profile(name or None) + + elif choice == "2": + # List profiles + profiles = self.list_profiles() + + if not profiles: + self.logger.warning(" No profiles found. 
Create one first with option 1.", tag="PROFILES") + continue + + # Print profile information with colorama formatting + self.logger.info("\nAvailable profiles:", tag="PROFILES") + for i, profile in enumerate(profiles): + self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES") + self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES") + self.logger.info("", tag="PROFILES") # Empty line for spacing + + elif choice == "3": + # Delete profile + profiles = self.list_profiles() + if not profiles: + self.logger.warning("No profiles found to delete", tag="PROFILES") + continue + + # Display numbered list + self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") + for i, profile in enumerate(profiles): + self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") + + # Get profile to delete + profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}") + if profile_idx.lower() == 'c': + continue + + try: + idx = int(profile_idx) - 1 + if 0 <= idx < len(profiles): + profile_name = profiles[idx]["name"] + self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES") + + # Confirm deletion + confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? 
(y/n): {Style.RESET_ALL}") + if confirm.lower() == 'y': + success = self.delete_profile(profiles[idx]["path"]) + + if success: + self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES") + else: + self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES") + else: + self.logger.error("Invalid profile number", tag="PROFILES") + except ValueError: + self.logger.error("Please enter a valid number", tag="PROFILES") + + elif choice == "4" and crawl_callback: + # Use profile to crawl a site + profiles = self.list_profiles() + if not profiles: + self.logger.warning("No profiles found. Create one first.", tag="PROFILES") + continue + + # Display numbered list + self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") + for i, profile in enumerate(profiles): + self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") + + # Get profile to use + profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}") + if profile_idx.lower() == 'c': + continue + + try: + idx = int(profile_idx) - 1 + if 0 <= idx < len(profiles): + profile_path = profiles[idx]["path"] + url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}") + if url: + # Call the provided crawl callback + await crawl_callback(profile_path, url) + else: + self.logger.error("No URL provided", tag="CRAWL") + else: + self.logger.error("Invalid profile number", tag="PROFILES") + except ValueError: + self.logger.error("Please enter a valid number", tag="PROFILES") + + elif choice == exit_option: + # Exit + self.logger.info("Exiting profile management", tag="MENU") + break + + else: + self.logger.error(f"Invalid choice. 
Please enter a number between 1 and {exit_option}.", tag="MENU") + + + async def launch_standalone_browser(self, + browser_type: str = "chromium", + user_data_dir: Optional[str] = None, + debugging_port: int = 9222, + headless: bool = False) -> Optional[str]: + """ + Launch a standalone browser with CDP debugging enabled and keep it running + until the user presses 'q'. Returns and displays the CDP URL. + + Args: + browser_type (str): Type of browser to launch ('chromium' or 'firefox') + user_data_dir (str, optional): Path to user profile directory + debugging_port (int): Port to use for CDP debugging + headless (bool): Whether to run in headless mode + + Returns: + str: CDP URL for the browser, or None if launch failed + + Example: + ```python + profiler = BrowserProfiler() + cdp_url = await profiler.launch_standalone_browser( + user_data_dir="/path/to/profile", + debugging_port=9222 + ) + # Use cdp_url to connect to the browser + ``` + """ + # Use the provided directory if specified, otherwise create a temporary directory + if user_data_dir: + # Directory is provided directly, ensure it exists + profile_path = user_data_dir + os.makedirs(profile_path, exist_ok=True) + else: + # Create a temporary profile directory + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + profile_name = f"temp_{timestamp}_{uuid.uuid4().hex[:6]}" + profile_path = os.path.join(self.profiles_dir, profile_name) + os.makedirs(profile_path, exist_ok=True) + + # Print initial information + border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" + self.logger.info(f"\n{border}", tag="CDP") + self.logger.info(f"Launching standalone browser with CDP debugging", tag="CDP") + self.logger.info(f"Browser type: {Fore.GREEN}{browser_type}{Style.RESET_ALL}", tag="CDP") + self.logger.info(f"Profile path: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CDP") + self.logger.info(f"Debugging port: {Fore.CYAN}{debugging_port}{Style.RESET_ALL}", tag="CDP") + self.logger.info(f"Headless mode: 
{Fore.CYAN}{headless}{Style.RESET_ALL}", tag="CDP") + + # Create managed browser instance + managed_browser = ManagedBrowser( + browser_type=browser_type, + user_data_dir=profile_path, + headless=headless, + logger=self.logger, + debugging_port=debugging_port + ) + + # Set up signal handlers to ensure cleanup on interrupt + original_sigint = signal.getsignal(signal.SIGINT) + original_sigterm = signal.getsignal(signal.SIGTERM) + + # Define cleanup handler for signals + async def cleanup_handler(sig, frame): + self.logger.warning("\nCleaning up browser process...", tag="CDP") + await managed_browser.cleanup() + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + if sig == signal.SIGINT: + self.logger.error("Browser terminated by user.", tag="CDP") + sys.exit(1) + + # Set signal handlers + def sigint_handler(sig, frame): + asyncio.create_task(cleanup_handler(sig, frame)) + + signal.signal(signal.SIGINT, sigint_handler) + signal.signal(signal.SIGTERM, sigint_handler) + + # Event to signal when user wants to exit + user_done_event = asyncio.Event() + + # Run keyboard input loop in a separate task + async def listen_for_quit_command(): + import termios + import tty + import select + + # First output the prompt + self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' to stop the browser and exit...{Style.RESET_ALL}", tag="CDP") + + # Save original terminal settings + fd = sys.stdin.fileno() + old_settings = termios.tcgetattr(fd) + + try: + # Switch to non-canonical mode (no line buffering) + tty.setcbreak(fd) + + while True: + # Check if input is available (non-blocking) + readable, _, _ = select.select([sys.stdin], [], [], 0.5) + if readable: + key = sys.stdin.read(1) + if key.lower() == 'q': + self.logger.info(f"{Fore.GREEN}Closing browser...{Style.RESET_ALL}", tag="CDP") + user_done_event.set() + return + + # Check if the browser process has already exited + if 
managed_browser.browser_process and managed_browser.browser_process.poll() is not None: + self.logger.info("Browser already closed. Ending input listener.", tag="CDP") + user_done_event.set() + return + + await asyncio.sleep(0.1) + + finally: + # Restore terminal settings + termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) + + # Function to retrieve and display CDP JSON config + async def get_cdp_json(port): + import aiohttp + cdp_url = f"http://localhost:{port}" + json_url = f"{cdp_url}/json/version" + + try: + async with aiohttp.ClientSession() as session: + # Try multiple times in case the browser is still starting up + for _ in range(10): + try: + async with session.get(json_url) as response: + if response.status == 200: + data = await response.json() + return cdp_url, data + except Exception: + pass + + await asyncio.sleep(0.5) + + return cdp_url, None + except Exception as e: + self.logger.error(f"Error fetching CDP JSON: {str(e)}", tag="CDP") + return cdp_url, None + + cdp_url = None + config_json = None + + try: + # Start the browser + await managed_browser.start() + + # Check if browser started successfully + browser_process = managed_browser.browser_process + if not browser_process: + self.logger.error("Failed to start browser process.", tag="CDP") + return None + + self.logger.info(f"Browser launched successfully. 
Retrieving CDP information...", tag="CDP") + + # Get CDP URL and JSON config + cdp_url, config_json = await get_cdp_json(debugging_port) + + if cdp_url: + self.logger.success(f"CDP URL: {Fore.GREEN}{cdp_url}{Style.RESET_ALL}", tag="CDP") + + if config_json: + # Display relevant CDP information + self.logger.info(f"Browser: {Fore.CYAN}{config_json.get('Browser', 'Unknown')}{Style.RESET_ALL}", tag="CDP") + self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP") + if 'webSocketDebuggerUrl' in config_json: + self.logger.info(f"WebSocket URL: {Fore.GREEN}{config_json['webSocketDebuggerUrl']}{Style.RESET_ALL}", tag="CDP") + else: + self.logger.warning("Could not retrieve CDP configuration JSON", tag="CDP") + else: + self.logger.error(f"Failed to get CDP URL on port {debugging_port}", tag="CDP") + await managed_browser.cleanup() + return None + + # Start listening for keyboard input + listener_task = asyncio.create_task(listen_for_quit_command()) + + # Wait for the user to press 'q' or for the browser process to exit naturally + while not user_done_event.is_set() and browser_process.poll() is None: + await asyncio.sleep(0.5) + + # Cancel the listener task if it's still running + if not listener_task.done(): + listener_task.cancel() + try: + await listener_task + except asyncio.CancelledError: + pass + + # If the browser is still running and the user pressed 'q', terminate it + if browser_process.poll() is None and user_done_event.is_set(): + self.logger.info("Terminating browser process...", tag="CDP") + await managed_browser.cleanup() + + self.logger.success(f"Browser closed.", tag="CDP") + + except Exception as e: + self.logger.error(f"Error launching standalone browser: {str(e)}", tag="CDP") + await managed_browser.cleanup() + return None + finally: + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + + # Make sure browser is fully cleaned up + 
await managed_browser.cleanup() + + # Return the CDP URL + return cdp_url + + diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index aabf6c0c..659bf2b3 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -1,9 +1,18 @@ import click import os -from typing import Dict, Any, Optional +import sys +import time + +import humanize +from typing import Dict, Any, Optional, List import json import yaml import anyio +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from rich.prompt import Prompt, Confirm + from crawl4ai import ( CacheMode, AsyncWebCrawler, @@ -14,12 +23,16 @@ from crawl4ai import ( JsonCssExtractionStrategy, JsonXPathExtractionStrategy, BM25ContentFilter, - PruningContentFilter + PruningContentFilter, + BrowserProfiler, + LLMConfig ) from litellm import completion from pathlib import Path -from crawl4ai.async_configs import LlmConfig + +# Initialize rich console +console = Console() def get_global_config() -> dict: config_dir = Path.home() / ".crawl4ai" @@ -172,7 +185,38 @@ def show_examples(): # Crawler settings crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true" -4️⃣ Sample Config Files: +4️⃣ Profile Management for Identity-Based Crawling: + # Launch interactive profile manager + crwl profiles + + # Create, list, and delete browser profiles for identity-based crawling + # Use a profile for crawling (keeps you logged in) + crwl https://example.com -p my-profile-name + + # Example: Crawl a site that requires login + # 1. First create a profile and log in: + crwl profiles + # 2. 
Then use that profile to crawl the authenticated site: + crwl https://site-requiring-login.com/dashboard -p my-profile-name + +5️⃣ CDP Mode for Browser Automation: + # Launch browser with CDP debugging on default port 9222 + crwl cdp + + # Use a specific profile and custom port + crwl cdp -p my-profile -P 9223 + + # Launch headless browser with CDP enabled + crwl cdp --headless + + # Launch in incognito mode (ignores profile) + crwl cdp --incognito + + # Use the CDP URL with other tools (Puppeteer, Playwright, etc.) + # The URL will be displayed in the terminal when the browser starts + + +6️⃣ Sample Config Files: browser.yml: headless: true @@ -230,7 +274,7 @@ llm_schema.json: } } -5️⃣ Advanced Usage: +7️⃣ Advanced Usage: # Combine configs with direct parameters crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920" @@ -248,9 +292,15 @@ llm_schema.json: -f filter_bm25.yml \\ -o markdown-fit + # Authenticated crawling with profile + crwl https://login-required-site.com \\ + -p my-authenticated-profile \\ + -c "css_selector=.dashboard-content" \\ + -o markdown + For more documentation visit: https://github.com/unclecode/crawl4ai -6️⃣ Q&A with LLM: +8️⃣ Q&A with LLM: # Ask a question about the content crwl https://example.com -q "What is the main topic discussed?" @@ -277,12 +327,331 @@ For more documentation visit: https://github.com/unclecode/crawl4ai - google/gemini-pro See full list of providers: https://docs.litellm.ai/docs/providers + +9️⃣ Profile Management: + # Launch interactive profile manager + crwl profiles + + # Create a profile and use it for crawling + crwl profiles # Create and set up your profile interactively + crwl https://example.com -p my-profile-name # Use profile for crawling + + # Example workflow for authenticated site + # 1. First create a profile and log in to the site: + crwl profiles # Select "Create new profile" option + # 2. 
Then use that profile to crawl authenticated content: + crwl https://site-requiring-login.com/dashboard -p my-profile-name """ click.echo(examples) -@click.command(context_settings={"help_option_names": ["-h", "--help"]}) -@click.argument("url", required=False) -@click.option("--example", is_flag=True, help="Show usage examples") +def get_directory_size(path: str) -> int: + """Calculate the total size of a directory in bytes""" + total_size = 0 + for dirpath, _, filenames in os.walk(path): + for f in filenames: + fp = os.path.join(dirpath, f) + if not os.path.islink(fp): + total_size += os.path.getsize(fp) + return total_size + +def display_profiles_table(profiles: List[Dict[str, Any]]): + """Display a rich table of browser profiles""" + if not profiles: + console.print(Panel("[yellow]No profiles found. Create one with the 'create' command.[/yellow]", + title="Browser Profiles", border_style="blue")) + return + + table = Table(title="Browser Profiles", show_header=True, header_style="bold cyan", border_style="blue") + table.add_column("#", style="dim", width=4) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("Path", style="green") + table.add_column("Created", style="yellow") + table.add_column("Browser", style="magenta") + table.add_column("Size", style="blue", justify="right") + + for i, profile in enumerate(profiles): + # Calculate folder size + size = get_directory_size(profile["path"]) + human_size = humanize.naturalsize(size) + + # Format creation date + created = profile["created"].strftime("%Y-%m-%d %H:%M") + + # Add row to table + table.add_row( + str(i+1), + profile["name"], + profile["path"], + created, + profile["type"].capitalize(), + human_size + ) + + console.print(table) + +async def create_profile_interactive(profiler: BrowserProfiler): + """Interactive profile creation wizard""" + console.print(Panel("[bold cyan]Create Browser Profile[/bold cyan]\n" + "This will open a browser window for you to set up your identity.\n" + 
"Log in to sites, adjust settings, then press 'q' to save.", + border_style="cyan")) + + profile_name = Prompt.ask("[cyan]Enter profile name[/cyan]", default=f"profile_{int(time.time())}") + + console.print("[cyan]Creating profile...[/cyan]") + console.print("[yellow]A browser window will open. After logging in to sites, press 'q' in this terminal to save.[/yellow]") + + # Create the profile + try: + profile_path = await profiler.create_profile(profile_name) + + if profile_path: + console.print(f"[green]Profile successfully created at:[/green] {profile_path}") + else: + console.print("[red]Failed to create profile.[/red]") + except Exception as e: + console.print(f"[red]Error creating profile: {str(e)}[/red]") + +def delete_profile_interactive(profiler: BrowserProfiler): + """Interactive profile deletion""" + profiles = profiler.list_profiles() + + if not profiles: + console.print("[yellow]No profiles found to delete.[/yellow]") + return + + # Display profiles + display_profiles_table(profiles) + + # Get profile selection + idx = Prompt.ask( + "[red]Enter number of profile to delete[/red]", + console=console, + choices=[str(i+1) for i in range(len(profiles))], + show_choices=False + ) + + try: + idx = int(idx) - 1 + profile = profiles[idx] + + # Confirm deletion + if Confirm.ask(f"[red]Are you sure you want to delete profile '{profile['name']}'?[/red]"): + success = profiler.delete_profile(profile["path"]) + + if success: + console.print(f"[green]Profile '{profile['name']}' deleted successfully.[/green]") + else: + console.print(f"[red]Failed to delete profile '{profile['name']}'.[/red]") + except (ValueError, IndexError): + console.print("[red]Invalid selection.[/red]") + +async def crawl_with_profile_cli(profile_path, url): + """Use a profile to crawl a website via CLI""" + console.print(f"[cyan]Crawling [bold]{url}[/bold] using profile at [bold]{profile_path}[/bold][/cyan]") + + # Create browser config with the profile + browser_cfg = BrowserConfig( + 
headless=False, # Set to False to see the browser in action + use_managed_browser=True, + user_data_dir=profile_path + ) + + # Default crawler config + crawler_cfg = CrawlerRunConfig() + + # Ask for output format + output_format = Prompt.ask( + "[cyan]Output format[/cyan]", + choices=["all", "json", "markdown", "md", "title"], + default="markdown" + ) + + try: + # Run the crawler + result = await run_crawler(url, browser_cfg, crawler_cfg, True) + + # Handle output + if output_format == "all": + console.print(json.dumps(result.model_dump(), indent=2)) + elif output_format == "json": + console.print(json.dumps(json.loads(result.extracted_content), indent=2)) + elif output_format in ["markdown", "md"]: + console.print(result.markdown.raw_markdown) + elif output_format == "title": + console.print(result.metadata.get("title", "No title found")) + + console.print(f"[green]Successfully crawled[/green] {url}") + return result + except Exception as e: + console.print(f"[red]Error crawling:[/red] {str(e)}") + return None + +async def use_profile_to_crawl(): + """Interactive profile selection for crawling""" + profiler = BrowserProfiler() + profiles = profiler.list_profiles() + + if not profiles: + console.print("[yellow]No profiles found. 
Create one first.[/yellow]") + return + + # Display profiles + display_profiles_table(profiles) + + # Get profile selection + idx = Prompt.ask( + "[cyan]Enter number of profile to use[/cyan]", + console=console, + choices=[str(i+1) for i in range(len(profiles))], + show_choices=False + ) + + try: + idx = int(idx) - 1 + profile = profiles[idx] + + # Get URL + url = Prompt.ask("[cyan]Enter URL to crawl[/cyan]") + if url: + # Crawl with the selected profile + await crawl_with_profile_cli(profile["path"], url) + else: + console.print("[red]No URL provided[/red]") + except (ValueError, IndexError): + console.print("[red]Invalid selection[/red]") + +async def manage_profiles(): + """Interactive profile management menu""" + profiler = BrowserProfiler() + + options = { + "1": "List profiles", + "2": "Create new profile", + "3": "Delete profile", + "4": "Use a profile to crawl a website", + "5": "Exit", + } + + while True: + console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan")) + + for key, value in options.items(): + color = "green" if key == "1" else "yellow" if key == "2" else "red" if key == "3" else "blue" if key == "4" else "cyan" + console.print(f"[{color}]{key}[/{color}]. 
{value}") + + choice = Prompt.ask("Enter choice", choices=list(options.keys()), default="1") + + if choice == "1": + # List profiles + profiles = profiler.list_profiles() + display_profiles_table(profiles) + + elif choice == "2": + # Create profile + await create_profile_interactive(profiler) + + elif choice == "3": + # Delete profile + delete_profile_interactive(profiler) + + elif choice == "4": + # Use profile to crawl + await use_profile_to_crawl() + + elif choice == "5": + # Exit + console.print("[cyan]Exiting profile manager.[/cyan]") + break + + # Add a separator between operations + console.print("\n") + + + +@click.group(context_settings={"help_option_names": ["-h", "--help"]}) +def cli(): + """Crawl4AI CLI - Web content extraction and browser profile management tool""" + pass + + +@cli.command("cdp") +@click.option("--user-data-dir", "-d", help="Directory to use for browser data (will be created if it doesn't exist)") +@click.option("--port", "-P", type=int, default=9222, help="Debugging port (default: 9222)") +@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium", + help="Browser type (default: chromium)") +@click.option("--headless", is_flag=True, help="Run browser in headless mode") +@click.option("--incognito", is_flag=True, help="Run in incognito/private mode (ignores user-data-dir)") +def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless: bool, incognito: bool): + """Launch a standalone browser with CDP debugging enabled + + This command launches a browser with Chrome DevTools Protocol (CDP) debugging enabled, + prints the CDP URL, and keeps the browser running until you press 'q'. + + The CDP URL can be used for various automation and debugging tasks. 
+ + Examples: + # Launch Chromium with CDP on default port 9222 + crwl cdp + + # Use a specific directory for browser data and custom port + crwl cdp --user-data-dir ~/browser-data --port 9223 + + # Launch in headless mode + crwl cdp --headless + + # Launch in incognito mode (ignores user-data-dir) + crwl cdp --incognito + """ + profiler = BrowserProfiler() + + try: + # Handle data directory + data_dir = None + if not incognito and user_data_dir: + # Expand user path (~/something) + expanded_path = os.path.expanduser(user_data_dir) + + # Create directory if it doesn't exist + if not os.path.exists(expanded_path): + console.print(f"[yellow]Directory '{expanded_path}' doesn't exist. Creating it.[/yellow]") + os.makedirs(expanded_path, exist_ok=True) + + data_dir = expanded_path + + # Print launch info + console.print(Panel( + f"[cyan]Launching browser with CDP debugging[/cyan]\n\n" + f"Browser type: [green]{browser_type}[/green]\n" + f"Debugging port: [yellow]{port}[/yellow]\n" + f"User data directory: [cyan]{data_dir or 'Temporary directory'}[/cyan]\n" + f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]\n" + f"Incognito: [cyan]{'Yes' if incognito else 'No'}[/cyan]\n\n" + f"[yellow]Press 'q' to quit when done[/yellow]", + title="CDP Browser", + border_style="cyan" + )) + + # Run the browser + cdp_url = anyio.run( + profiler.launch_standalone_browser, + browser_type, + data_dir, + port, + headless + ) + + if not cdp_url: + console.print("[red]Failed to launch browser or get CDP URL[/red]") + sys.exit(1) + + except Exception as e: + console.print(f"[red]Error launching CDP browser: {str(e)}[/red]") + sys.exit(1) + + +@cli.command("crawl") +@click.argument("url", required=True) @click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)") @click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)") @click.option("--filter-config", "-f", type=click.Path(exists=True), 
help="Content filter config file") @@ -291,26 +660,44 @@ For more documentation visit: https://github.com/unclecode/crawl4ai @click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2") @click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") @click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all") -@click.option("--bypass-cache", is_flag=True, default = True, help="Bypass cache when crawling") +@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling") @click.option("--question", "-q", help="Ask a question about the crawled content") @click.option("--verbose", "-v", is_flag=True) -def cli(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, - extraction_config: str, schema: str, browser: Dict, crawler: Dict, - output: str, bypass_cache: bool, question: str, verbose: bool): - """Crawl4AI CLI - Web content extraction tool - +@click.option("--profile", "-p", help="Use a specific browser profile (by name)") +def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, + extraction_config: str, schema: str, browser: Dict, crawler: Dict, + output: str, bypass_cache: bool, question: str, verbose: bool, profile: str): + """Crawl a website and extract content + Simple Usage: - crwl https://example.com + crwl crawl https://example.com + """ - Run with --example to see detailed usage examples.""" - - if example: - show_examples() - return + # Handle profile option + if profile: + profiler = BrowserProfiler() + profile_path = profiler.get_profile_path(profile) - if not url: - raise click.UsageError("URL argument is required unless using --example") - + if not profile_path: + profiles = profiler.list_profiles() + + if profiles: + console.print(f"[red]Profile '{profile}' not found. 
Available profiles:[/red]") + display_profiles_table(profiles) + else: + console.print("[red]No profiles found. Create one with 'crwl profiles'[/red]") + + return + + # Include the profile in browser config + if not browser: + browser = {} + browser["user_data_dir"] = profile_path + browser["use_managed_browser"] = True + + if verbose: + console.print(f"[green]Using browser profile:[/green] {profile}") + try: # Load base configurations browser_cfg = BrowserConfig.load(load_config_file(browser_config)) @@ -353,7 +740,7 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte raise click.ClickException("LLM provider and API token are required for LLM extraction") crawler_cfg.extraction_strategy = LLMExtractionStrategy( - llmConfig=LlmConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]), + llm_config=LLMConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]), instruction=extract_conf["instruction"], schema=schema_data, **extract_conf.get("params", {}) @@ -401,5 +788,89 @@ def cli(url: str, example: bool, browser_config: str, crawler_config: str, filte except Exception as e: raise click.ClickException(str(e)) +@cli.command("examples") +def examples_cmd(): + """Show usage examples""" + show_examples() + +@cli.command("profiles") +def profiles_cmd(): + """Manage browser profiles interactively + + Launch an interactive browser profile manager where you can: + - List all existing profiles + - Create new profiles for authenticated browsing + - Delete unused profiles + """ + # Run interactive profile manager + anyio.run(manage_profiles) + +@cli.command(name="") +@click.argument("url", required=False) +@click.option("--example", is_flag=True, help="Show usage examples") +@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)") +@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)") 
+@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file") +@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file") +@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction") +@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2") +@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") +@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all") +@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling") +@click.option("--question", "-q", help="Ask a question about the crawled content") +@click.option("--verbose", "-v", is_flag=True) +@click.option("--profile", "-p", help="Use a specific browser profile (by name)") +def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, + extraction_config: str, schema: str, browser: Dict, crawler: Dict, + output: str, bypass_cache: bool, question: str, verbose: bool, profile: str): + """Crawl4AI CLI - Web content extraction tool + + Simple Usage: + crwl https://example.com + + Run with --example to see detailed usage examples. 
+ + Other commands: + crwl profiles - Manage browser profiles for identity-based crawling + crwl crawl - Crawl a website with advanced options + crwl cdp - Launch browser with CDP debugging enabled + crwl examples - Show more usage examples + """ + + if example: + show_examples() + return + + if not url: + # Show help without error message + ctx = click.get_current_context() + click.echo(ctx.get_help()) + return + + # Forward to crawl command + ctx = click.get_current_context() + ctx.invoke( + crawl_cmd, + url=url, + browser_config=browser_config, + crawler_config=crawler_config, + filter_config=filter_config, + extraction_config=extraction_config, + schema=schema, + browser=browser, + crawler=crawler, + output=output, + bypass_cache=bypass_cache, + question=question, + verbose=verbose, + profile=profile + ) + +def main(): + import sys + if len(sys.argv) < 2 or sys.argv[1] not in cli.commands: + sys.argv.insert(1, "crawl") + cli() + if __name__ == "__main__": - cli() \ No newline at end of file + main() \ No newline at end of file diff --git a/crawl4ai/configs/__init__.py b/crawl4ai/configs/__init__.py deleted file mode 100644 index b92adb35..00000000 --- a/crawl4ai/configs/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .proxy_config import ProxyConfig -__all__ = ["ProxyConfig"] \ No newline at end of file diff --git a/crawl4ai/configs/proxy_config.py b/crawl4ai/configs/proxy_config.py deleted file mode 100644 index c447c6bc..00000000 --- a/crawl4ai/configs/proxy_config.py +++ /dev/null @@ -1,113 +0,0 @@ -import os -from typing import Dict, List, Optional - - -class ProxyConfig: - def __init__( - self, - server: str, - username: Optional[str] = None, - password: Optional[str] = None, - ip: Optional[str] = None, - ): - """Configuration class for a single proxy. 
- - Args: - server: Proxy server URL (e.g., "http://127.0.0.1:8080") - username: Optional username for proxy authentication - password: Optional password for proxy authentication - ip: Optional IP address for verification purposes - """ - self.server = server - self.username = username - self.password = password - - # Extract IP from server if not explicitly provided - self.ip = ip or self._extract_ip_from_server() - - def _extract_ip_from_server(self) -> Optional[str]: - """Extract IP address from server URL.""" - try: - # Simple extraction assuming http://ip:port format - if "://" in self.server: - parts = self.server.split("://")[1].split(":") - return parts[0] - else: - parts = self.server.split(":") - return parts[0] - except Exception: - return None - - @staticmethod - def from_string(proxy_str: str) -> "ProxyConfig": - """Create a ProxyConfig from a string in the format 'ip:port:username:password'.""" - parts = proxy_str.split(":") - if len(parts) == 4: # ip:port:username:password - ip, port, username, password = parts - return ProxyConfig( - server=f"http://{ip}:{port}", - username=username, - password=password, - ip=ip - ) - elif len(parts) == 2: # ip:port only - ip, port = parts - return ProxyConfig( - server=f"http://{ip}:{port}", - ip=ip - ) - else: - raise ValueError(f"Invalid proxy string format: {proxy_str}") - - @staticmethod - def from_dict(proxy_dict: Dict) -> "ProxyConfig": - """Create a ProxyConfig from a dictionary.""" - return ProxyConfig( - server=proxy_dict.get("server"), - username=proxy_dict.get("username"), - password=proxy_dict.get("password"), - ip=proxy_dict.get("ip") - ) - - @staticmethod - def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]: - """Load proxies from environment variable. 
- - Args: - env_var: Name of environment variable containing comma-separated proxy strings - - Returns: - List of ProxyConfig objects - """ - proxies = [] - try: - proxy_list = os.getenv(env_var, "").split(",") - for proxy in proxy_list: - if not proxy: - continue - proxies.append(ProxyConfig.from_string(proxy)) - except Exception as e: - print(f"Error loading proxies from environment: {e}") - return proxies - - def to_dict(self) -> Dict: - """Convert to dictionary representation.""" - return { - "server": self.server, - "username": self.username, - "password": self.password, - "ip": self.ip - } - - def clone(self, **kwargs) -> "ProxyConfig": - """Create a copy of this configuration with updated values. - - Args: - **kwargs: Key-value pairs of configuration options to update - - Returns: - ProxyConfig: A new instance with the specified updates - """ - config_dict = self.to_dict() - config_dict.update(kwargs) - return ProxyConfig.from_dict(config_dict) diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 06c09eba..8d7a51b4 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -16,13 +16,13 @@ from .utils import ( extract_xml_data, merge_chunks, ) +from .types import LLMConfig +from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE from abc import ABC, abstractmethod import math from snowballstemmer import stemmer -from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE, PROVIDER_MODELS from .models import TokenUsage from .prompts import PROMPT_FILTER_CONTENT -import os import json import hashlib from pathlib import Path @@ -770,37 +770,56 @@ class PruningContentFilter(RelevantContentFilter): class LLMContentFilter(RelevantContentFilter): - """Content filtering using LLMs to generate relevant markdown.""" + """Content filtering using LLMs to generate relevant markdown. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. 
Extracts text chunks from the body element. + 3. Applies LLMs to generate markdown for each chunk. + 4. Filters out chunks below the threshold. + 5. Sorts chunks by score in descending order. + 6. Returns the top N chunks. + + Attributes: + llm_config (LLMConfig): LLM configuration object. + instruction (str): Instruction for LLM markdown generation + chunk_token_threshold (int): Chunk token threshold for splitting (default: 1e9). + overlap_rate (float): Overlap rate for chunking (default: 0.5). + word_token_rate (float): Word token rate for chunking (default: 0.2). + verbose (bool): Enable verbose logging (default: False). + logger (AsyncLogger): Custom logger for LLM operations (optional). + """ _UNWANTED_PROPS = { - 'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")', - 'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")', - 'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")', - 'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")', + 'provider' : 'Instead, use llm_config=LLMConfig(provider="...")', + 'api_token' : 'Instead, use llm_config=LlMConfig(api_token="...")', + 'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")', + 'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")', } def __init__( self, - provider: str = DEFAULT_PROVIDER, - api_token: Optional[str] = None, - llmConfig: "LlmConfig" = None, + llm_config: "LLMConfig" = None, instruction: str = None, chunk_token_threshold: int = int(1e9), overlap_rate: float = OVERLAP_RATE, word_token_rate: float = WORD_TOKEN_RATE, - base_url: Optional[str] = None, - api_base: Optional[str] = None, - extra_args: Dict = None, # char_token_rate: float = WORD_TOKEN_RATE * 5, # chunk_mode: str = "char", verbose: bool = False, logger: Optional[AsyncLogger] = None, ignore_cache: bool = True, + # Deprecated properties + provider: str = DEFAULT_PROVIDER, + api_token: Optional[str] = None, + base_url: Optional[str] = None, + api_base: Optional[str] = None, + 
extra_args: Dict = None, ): super().__init__(None) self.provider = provider self.api_token = api_token self.base_url = base_url or api_base - self.llmConfig = llmConfig + self.llm_config = llm_config self.instruction = instruction self.chunk_token_threshold = chunk_token_threshold self.overlap_rate = overlap_rate @@ -872,7 +891,7 @@ class LLMContentFilter(RelevantContentFilter): self.logger.info( "Starting LLM markdown content filtering process", tag="LLM", - params={"provider": self.llmConfig.provider}, + params={"provider": self.llm_config.provider}, colors={"provider": Fore.CYAN}, ) @@ -959,10 +978,10 @@ class LLMContentFilter(RelevantContentFilter): future = executor.submit( _proceed_with_chunk, - self.llmConfig.provider, + self.llm_config.provider, prompt, - self.llmConfig.api_token, - self.llmConfig.base_url, + self.llm_config.api_token, + self.llm_config.base_url, self.extra_args, ) futures.append((i, future)) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 719cab8e..ef622abe 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -155,6 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): for aud in raw_result.get("media", {}).get("audios", []) if aud ], + tables=raw_result.get("media", {}).get("tables", []) ) # Convert links @@ -193,6 +194,153 @@ class WebScrapingStrategy(ContentScrapingStrategy): """ return await asyncio.to_thread(self._scrap, url, html, **kwargs) + def is_data_table(self, table: Tag, **kwargs) -> bool: + """ + Determine if a table element is a data table (not a layout table). 
+ + Args: + table (Tag): BeautifulSoup Tag representing a table element + **kwargs: Additional keyword arguments including table_score_threshold + + Returns: + bool: True if the table is a data table, False otherwise + """ + score = 0 + + # Check for thead and tbody + has_thead = len(table.select('thead')) > 0 + has_tbody = len(table.select('tbody')) > 0 + if has_thead: + score += 2 + if has_tbody: + score += 1 + + # Check for th elements + th_count = len(table.select('th')) + if th_count > 0: + score += 2 + if has_thead or len(table.select('tr:first-child th')) > 0: + score += 1 + + # Check for nested tables + if len(table.select('table')) > 0: + score -= 3 + + # Role attribute check + role = table.get('role', '').lower() + if role in {'presentation', 'none'}: + score -= 3 + + # Column consistency + rows = table.select('tr') + if not rows: + return False + + col_counts = [len(row.select('td, th')) for row in rows] + avg_cols = sum(col_counts) / len(col_counts) + variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts) + if variance < 1: + score += 2 + + # Caption and summary + if table.select('caption'): + score += 2 + if table.has_attr('summary') and table['summary']: + score += 1 + + # Text density + total_text = sum(len(cell.get_text().strip()) for row in rows for cell in row.select('td, th')) + total_tags = sum(1 for _ in table.descendants if isinstance(_, Tag)) + text_ratio = total_text / (total_tags + 1e-5) + if text_ratio > 20: + score += 3 + elif text_ratio > 10: + score += 2 + + # Data attributes + data_attrs = sum(1 for attr in table.attrs if attr.startswith('data-')) + score += data_attrs * 0.5 + + # Size check + if avg_cols >= 2 and len(rows) >= 2: + score += 2 + + threshold = kwargs.get('table_score_threshold', 7) + return score >= threshold + + def extract_table_data(self, table: Tag) -> dict: + """ + Extract structured data from a table element. 
+ + Args: + table (Tag): BeautifulSoup Tag representing a table element + + Returns: + dict: Dictionary containing table data (headers, rows, caption, summary) + """ + caption_elem = table.select_one('caption') + caption = caption_elem.get_text().strip() if caption_elem else "" + summary = table.get('summary', '').strip() + + # Extract headers with colspan handling + headers = [] + thead_rows = table.select('thead tr') + if thead_rows: + header_cells = thead_rows[0].select('th') + for cell in header_cells: + text = cell.get_text().strip() + colspan = int(cell.get('colspan', 1)) + headers.extend([text] * colspan) + else: + first_row = table.select('tr:first-child') + if first_row: + for cell in first_row[0].select('th, td'): + text = cell.get_text().strip() + colspan = int(cell.get('colspan', 1)) + headers.extend([text] * colspan) + + # Extract rows with colspan handling + rows = [] + all_rows = table.select('tr') + thead = table.select_one('thead') + tbody_rows = [] + + if thead: + thead_rows = thead.select('tr') + tbody_rows = [row for row in all_rows if row not in thead_rows] + else: + if all_rows and all_rows[0].select('th'): + tbody_rows = all_rows[1:] + else: + tbody_rows = all_rows + + for row in tbody_rows: + # for row in table.select('tr:not(:has(ancestor::thead))'): + row_data = [] + for cell in row.select('td'): + text = cell.get_text().strip() + colspan = int(cell.get('colspan', 1)) + row_data.extend([text] * colspan) + if row_data: + rows.append(row_data) + + # Align rows with headers + max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0) + aligned_rows = [] + for row in rows: + aligned = row[:max_columns] + [''] * (max_columns - len(row)) + aligned_rows.append(aligned) + + if not headers: + headers = [f"Column {i+1}" for i in range(max_columns)] + + return { + "headers": headers, + "rows": aligned_rows, + "caption": caption, + "summary": summary, + } + def flatten_nested_elements(self, node): """ Flatten nested 
elements in a HTML tree. @@ -431,7 +579,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): Returns: dict: A dictionary containing the processed element information. """ - media = {"images": [], "videos": [], "audios": []} + media = {"images": [], "videos": [], "audios": [], "tables": []} internal_links_dict = {} external_links_dict = {} self._process_element( @@ -691,6 +839,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, + target_elements: List[str] = None, **kwargs, ) -> Dict[str, Any]: """ @@ -745,22 +894,37 @@ class WebScrapingStrategy(ContentScrapingStrategy): for element in body.select(excluded_selector): element.extract() - if css_selector: - selected_elements = body.select(css_selector) - if not selected_elements: - return { - "markdown": "", - "cleaned_html": "", - "success": True, - "media": {"images": [], "videos": [], "audios": []}, - "links": {"internal": [], "external": []}, - "metadata": {}, - "message": f"No elements found for CSS selector: {css_selector}", - } - # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") - body = soup.new_tag("div") - for el in selected_elements: - body.append(el) + # if False and css_selector: + # selected_elements = body.select(css_selector) + # if not selected_elements: + # return { + # "markdown": "", + # "cleaned_html": "", + # "success": True, + # "media": {"images": [], "videos": [], "audios": []}, + # "links": {"internal": [], "external": []}, + # "metadata": {}, + # "message": f"No elements found for CSS selector: {css_selector}", + # } + # # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") + # body = soup.new_tag("div") + # for el in selected_elements: + # body.append(el) + + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: 
+ for_content_targeted_element.extend(body.select(target_element)) + content_element = soup.new_tag("div") + for el in for_content_targeted_element: + content_element.append(el) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -800,6 +964,15 @@ class WebScrapingStrategy(ContentScrapingStrategy): if result is not None for img in result ] + + # Process tables if not excluded + excluded_tags = set(kwargs.get("excluded_tags", []) or []) + if 'table' not in excluded_tags: + tables = body.find_all('table') + for table in tables: + if self.is_data_table(table, **kwargs): + table_data = self.extract_table_data(table) + media["tables"].append(table_data) body = self.flatten_nested_elements(body) base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') @@ -811,7 +984,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = "" try: - str_body = body.encode_contents().decode("utf-8") + str_body = content_element.encode_contents().decode("utf-8") except Exception: # Reset body to the original HTML success = False @@ -850,7 +1023,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): cleaned_html = str_body.replace("\n\n", "\n").replace(" ", " ") return { - # **markdown_content, "cleaned_html": cleaned_html, "success": success, "media": media, @@ -1193,12 +1365,125 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): return root + def is_data_table(self, table: etree.Element, **kwargs) -> bool: + score = 0 + # Check for thead and tbody + has_thead = len(table.xpath(".//thead")) > 0 + has_tbody = len(table.xpath(".//tbody")) > 0 + if has_thead: + score += 2 + if has_tbody: + score += 1 + + # Check for th elements + th_count = len(table.xpath(".//th")) + if th_count > 0: + score += 2 + if has_thead or table.xpath(".//tr[1]/th"): + score += 
1 + + # Check for nested tables + if len(table.xpath(".//table")) > 0: + score -= 3 + + # Role attribute check + role = table.get("role", "").lower() + if role in {"presentation", "none"}: + score -= 3 + + # Column consistency + rows = table.xpath(".//tr") + if not rows: + return False + col_counts = [len(row.xpath(".//td|.//th")) for row in rows] + avg_cols = sum(col_counts) / len(col_counts) + variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts) + if variance < 1: + score += 2 + + # Caption and summary + if table.xpath(".//caption"): + score += 2 + if table.get("summary"): + score += 1 + + # Text density + total_text = sum(len(''.join(cell.itertext()).strip()) for row in rows for cell in row.xpath(".//td|.//th")) + total_tags = sum(1 for _ in table.iterdescendants()) + text_ratio = total_text / (total_tags + 1e-5) + if text_ratio > 20: + score += 3 + elif text_ratio > 10: + score += 2 + + # Data attributes + data_attrs = sum(1 for attr in table.attrib if attr.startswith('data-')) + score += data_attrs * 0.5 + + # Size check + if avg_cols >= 2 and len(rows) >= 2: + score += 2 + + threshold = kwargs.get("table_score_threshold", 7) + return score >= threshold + + def extract_table_data(self, table: etree.Element) -> dict: + caption = table.xpath(".//caption/text()") + caption = caption[0].strip() if caption else "" + summary = table.get("summary", "").strip() + + # Extract headers with colspan handling + headers = [] + thead_rows = table.xpath(".//thead/tr") + if thead_rows: + header_cells = thead_rows[0].xpath(".//th") + for cell in header_cells: + text = cell.text_content().strip() + colspan = int(cell.get("colspan", 1)) + headers.extend([text] * colspan) + else: + first_row = table.xpath(".//tr[1]") + if first_row: + for cell in first_row[0].xpath(".//th|.//td"): + text = cell.text_content().strip() + colspan = int(cell.get("colspan", 1)) + headers.extend([text] * colspan) + + # Extract rows with colspan handling + rows = [] + for row in 
table.xpath(".//tr[not(ancestor::thead)]"): + row_data = [] + for cell in row.xpath(".//td"): + text = cell.text_content().strip() + colspan = int(cell.get("colspan", 1)) + row_data.extend([text] * colspan) + if row_data: + rows.append(row_data) + + # Align rows with headers + max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0) + aligned_rows = [] + for row in rows: + aligned = row[:max_columns] + [''] * (max_columns - len(row)) + aligned_rows.append(aligned) + + if not headers: + headers = [f"Column {i+1}" for i in range(max_columns)] + + return { + "headers": headers, + "rows": aligned_rows, + "caption": caption, + "summary": summary, + } + def _scrap( self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, + target_elements: List[str] = None, **kwargs, ) -> Dict[str, Any]: if not html: @@ -1249,24 +1534,38 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): meta = {} # Handle CSS selector targeting - if css_selector: + # if css_selector: + # try: + # selected_elements = body.cssselect(css_selector) + # if not selected_elements: + # return { + # "markdown": "", + # "cleaned_html": "", + # "success": True, + # "media": {"images": [], "videos": [], "audios": []}, + # "links": {"internal": [], "external": []}, + # "metadata": meta, + # "message": f"No elements found for CSS selector: {css_selector}", + # } + # body = lhtml.Element("div") + # body.extend(selected_elements) + # except Exception as e: + # self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE") + # return None + + content_element = None + if target_elements: try: - selected_elements = body.cssselect(css_selector) - if not selected_elements: - return { - "markdown": "", - "cleaned_html": "", - "success": True, - "media": {"images": [], "videos": [], "audios": []}, - "links": {"internal": [], "external": []}, - "metadata": meta, - "message": f"No elements found for CSS selector: {css_selector}", - } - body = 
lhtml.Element("div") - body.extend(selected_elements) + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) except Exception as e: - self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE") + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None + else: + content_element = body # Remove script and style tags for tag in ["script", "style", "link", "meta", "noscript"]: @@ -1290,7 +1589,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): form.getparent().remove(form) # Process content - media = {"images": [], "videos": [], "audios": []} + media = {"images": [], "videos": [], "audios": [], "tables": []} internal_links_dict = {} external_links_dict = {} @@ -1304,6 +1603,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): **kwargs, ) + if 'table' not in excluded_tags: + tables = body.xpath(".//table") + for table in tables: + if self.is_data_table(table, **kwargs): + table_data = self.extract_table_data(table) + media["tables"].append(table_data) + # Handle only_text option if kwargs.get("only_text", False): for tag in ONLY_TEXT_ELIGIBLE_TAGS: @@ -1330,7 +1636,8 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): # Generate output HTML cleaned_html = lhtml.tostring( - body, + # body, + content_element, encoding="unicode", pretty_print=True, method="html", @@ -1375,7 +1682,12 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): return { "cleaned_html": cleaned_html, "success": False, - "media": {"images": [], "videos": [], "audios": []}, + "media": { + "images": [], + "videos": [], + "audios": [], + "tables": [] + }, "links": {"internal": [], "external": []}, "metadata": {}, } diff --git a/crawl4ai/deep_crawling/base_strategy.py b/crawl4ai/deep_crawling/base_strategy.py index 222338a3..e1b3fe6b 100644 --- 
a/crawl4ai/deep_crawling/base_strategy.py +++ b/crawl4ai/deep_crawling/base_strategy.py @@ -16,7 +16,7 @@ class DeepCrawlDecorator: def __call__(self, original_arun): @wraps(original_arun) - async def wrapped_arun(url: str, config: Optional[CrawlerRunConfig] = None, **kwargs): + async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs): # If deep crawling is already active, call the original method to avoid recursion. if config and config.deep_crawl_strategy and not self.deep_crawl_active.get(): token = self.deep_crawl_active.set(True) diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index f1e871ee..4811ba14 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -12,6 +12,7 @@ from . import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn +from math import inf as infinity # Configurable batch size for processing items from the priority queue BATCH_SIZE = 10 @@ -37,15 +38,18 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): filter_chain: FilterChain = FilterChain(), url_scorer: Optional[URLScorer] = None, include_external: bool = False, + max_pages: int = infinity, logger: Optional[logging.Logger] = None, ): self.max_depth = max_depth self.filter_chain = filter_chain self.url_scorer = url_scorer self.include_external = include_external + self.max_pages = max_pages self.logger = logger or logging.getLogger(__name__) self.stats = TraversalStats(start_time=datetime.now()) self._cancel_event = asyncio.Event() + self._pages_crawled = 0 async def can_process_url(self, url: str, depth: int) -> bool: """ @@ -86,12 +90,20 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): new_depth = current_depth + 1 if new_depth > self.max_depth: return + + # If we've reached the max pages limit, don't discover new links + remaining_capacity = self.max_pages - self._pages_crawled + if remaining_capacity <= 0: + 
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery") + return # Retrieve internal links; include external links if enabled. links = result.links.get("internal", []) if self.include_external: links += result.links.get("external", []) + # If we have more links than remaining capacity, limit how many we'll process + valid_links = [] for link in links: url = link.get("href") if url in visited: @@ -99,8 +111,16 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): if not await self.can_process_url(url, new_depth): self.stats.urls_skipped += 1 continue - - # Record the new depth. + + valid_links.append(url) + + # If we have more valid links than capacity, limit them + if len(valid_links) > remaining_capacity: + valid_links = valid_links[:remaining_capacity] + self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit") + + # Record the new depths and add to next_links + for url in valid_links: depths[url] = new_depth next_links.append((url, source_url)) @@ -123,6 +143,11 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): depths: Dict[str, int] = {start_url: 0} while not queue.empty() and not self._cancel_event.is_set(): + # Stop if we've reached the max pages limit + if self._pages_crawled >= self.max_pages: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl") + break + batch: List[Tuple[float, int, str, Optional[str]]] = [] # Retrieve up to BATCH_SIZE items from the priority queue. for _ in range(BATCH_SIZE): @@ -153,14 +178,23 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): result.metadata["depth"] = depth result.metadata["parent_url"] = parent_url result.metadata["score"] = score + + # Count only successful crawls toward max_pages limit + if result.success: + self._pages_crawled += 1 + yield result - # Discover new links from this result. 
- new_links: List[Tuple[str, Optional[str]]] = [] - await self.link_discovery(result, result_url, depth, visited, new_links, depths) - for new_url, new_parent in new_links: - new_depth = depths.get(new_url, depth + 1) - new_score = self.url_scorer.score(new_url) if self.url_scorer else 0 - await queue.put((new_score, new_depth, new_url, new_parent)) + + # Only discover links from successful crawls + if result.success: + # Discover new links from this result + new_links: List[Tuple[str, Optional[str]]] = [] + await self.link_discovery(result, result_url, depth, visited, new_links, depths) + + for new_url, new_parent in new_links: + new_depth = depths.get(new_url, depth + 1) + new_score = self.url_scorer.score(new_url) if self.url_scorer else 0 + await queue.put((new_score, new_depth, new_url, new_parent)) # End of crawl. diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 48c0c240..54b72ea3 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -10,6 +10,8 @@ from .filters import FilterChain from .scorers import URLScorer from . 
import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult +from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl +from math import inf as infinity class BFSDeepCrawlStrategy(DeepCrawlStrategy): """ @@ -24,17 +26,22 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self, max_depth: int, filter_chain: FilterChain = FilterChain(), - url_scorer: Optional[URLScorer] = None, + url_scorer: Optional[URLScorer] = None, include_external: bool = False, + score_threshold: float = -infinity, + max_pages: int = infinity, logger: Optional[logging.Logger] = None, ): self.max_depth = max_depth self.filter_chain = filter_chain self.url_scorer = url_scorer self.include_external = include_external + self.score_threshold = score_threshold + self.max_pages = max_pages self.logger = logger or logging.getLogger(__name__) self.stats = TraversalStats(start_time=datetime.now()) self._cancel_event = asyncio.Event() + self._pages_crawled = 0 async def can_process_url(self, url: str, depth: int) -> bool: """ @@ -72,28 +79,59 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): prepares the next level of URLs. Each valid URL is appended to next_level as a tuple (url, parent_url) and its depth is tracked. - """ + """ next_depth = current_depth + 1 if next_depth > self.max_depth: return + # If we've reached the max pages limit, don't discover new links + remaining_capacity = self.max_pages - self._pages_crawled + if remaining_capacity <= 0: + self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery") + return + # Get internal links and, if enabled, external links. 
links = result.links.get("internal", []) if self.include_external: links += result.links.get("external", []) + valid_links = [] + + # First collect all valid links for link in links: url = link.get("href") - if url in visited: + # Strip URL fragments to avoid duplicate crawling + # base_url = url.split('#')[0] if url else url + base_url = normalize_url_for_deep_crawl(url, source_url) + if base_url in visited: continue if not await self.can_process_url(url, next_depth): self.stats.urls_skipped += 1 continue - # Score the URL if a scorer is provided. In this simple BFS - # the score is not used for ordering. - score = self.url_scorer.score(url) if self.url_scorer else 0 - # attach the score to metadata if needed. + # Score the URL if a scorer is provided + score = self.url_scorer.score(base_url) if self.url_scorer else 0 + + # Skip URLs with scores below the threshold + if score < self.score_threshold: + self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}") + self.stats.urls_skipped += 1 + continue + + valid_links.append((base_url, score)) + + # If we have more valid links than capacity, sort by score and take the top ones + if len(valid_links) > remaining_capacity: + if self.url_scorer: + # Sort by score in descending order + valid_links.sort(key=lambda x: x[1], reverse=True) + # Take only as many as we have capacity for + valid_links = valid_links[:remaining_capacity] + self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit") + + # Process the final selected links + for url, score in valid_links: + # attach the score to metadata if needed if score: result.metadata = result.metadata or {} result.metadata["score"] = score @@ -125,7 +163,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): # Clone the config to disable deep crawling recursion and enforce batch mode. 
batch_config = config.clone(deep_crawl_strategy=None, stream=False) batch_results = await crawler.arun_many(urls=urls, config=batch_config) - + + # Update pages crawled counter - count only successful crawls + successful_results = [r for r in batch_results if r.success] + self._pages_crawled += len(successful_results) + for result in batch_results: url = result.url depth = depths.get(url, 0) @@ -134,7 +176,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): parent_url = next((parent for (u, parent) in current_level if u == url), None) result.metadata["parent_url"] = parent_url results.append(result) - await self.link_discovery(result, url, depth, visited, next_level, depths) + + # Only discover links from successful crawls + if result.success: + # Link discovery will handle the max pages limit internally + await self.link_discovery(result, url, depth, visited, next_level, depths) current_level = next_level @@ -161,6 +207,9 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): stream_config = config.clone(deep_crawl_strategy=None, stream=True) stream_gen = await crawler.arun_many(urls=urls, config=stream_config) + + # Keep track of processed results for this batch + results_count = 0 async for result in stream_gen: url = result.url depth = depths.get(url, 0) @@ -168,9 +217,24 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): result.metadata["depth"] = depth parent_url = next((parent for (u, parent) in current_level if u == url), None) result.metadata["parent_url"] = parent_url + + # Count only successful crawls + if result.success: + self._pages_crawled += 1 + + results_count += 1 yield result - await self.link_discovery(result, url, depth, visited, next_level, depths) - + + # Only discover links from successful crawls + if result.success: + # Link discovery will handle the max pages limit internally + await self.link_discovery(result, url, depth, visited, next_level, depths) + + # If we didn't get results back (e.g. 
due to errors), avoid getting stuck in an infinite loop + # by considering these URLs as visited but not counting them toward the max_pages limit + if results_count == 0 and urls: + self.logger.warning(f"No results returned for {len(urls)} URLs, marking as visited") + current_level = next_level async def shutdown(self) -> None: diff --git a/crawl4ai/deep_crawling/dfs_strategy.py b/crawl4ai/deep_crawling/dfs_strategy.py index 423315f0..f79f9628 100644 --- a/crawl4ai/deep_crawling/dfs_strategy.py +++ b/crawl4ai/deep_crawling/dfs_strategy.py @@ -37,6 +37,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): # Clone config to disable recursive deep crawling. batch_config = config.clone(deep_crawl_strategy=None, stream=False) url_results = await crawler.arun_many(urls=[url], config=batch_config) + for result in url_results: result.metadata = result.metadata or {} result.metadata["depth"] = depth @@ -44,13 +45,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): if self.url_scorer: result.metadata["score"] = self.url_scorer.score(url) results.append(result) - - new_links: List[Tuple[str, Optional[str]]] = [] - await self.link_discovery(result, url, depth, visited, new_links, depths) - # Push new links in reverse order so the first discovered is processed next. - for new_url, new_parent in reversed(new_links): - new_depth = depths.get(new_url, depth + 1) - stack.append((new_url, new_parent, new_depth)) + + # Count only successful crawls toward max_pages limit + if result.success: + self._pages_crawled += 1 + + # Only discover links from successful crawls + new_links: List[Tuple[str, Optional[str]]] = [] + await self.link_discovery(result, url, depth, visited, new_links, depths) + + # Push new links in reverse order so the first discovered is processed next. 
+ for new_url, new_parent in reversed(new_links): + new_depth = depths.get(new_url, depth + 1) + stack.append((new_url, new_parent, new_depth)) return results async def _arun_stream( @@ -83,8 +90,13 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): result.metadata["score"] = self.url_scorer.score(url) yield result - new_links: List[Tuple[str, Optional[str]]] = [] - await self.link_discovery(result, url, depth, visited, new_links, depths) - for new_url, new_parent in reversed(new_links): - new_depth = depths.get(new_url, depth + 1) - stack.append((new_url, new_parent, new_depth)) + # Only count successful crawls toward max_pages limit + # and only discover links from successful crawls + if result.success: + self._pages_crawled += 1 + + new_links: List[Tuple[str, Optional[str]]] = [] + await self.link_discovery(result, url, depth, visited, new_links, depths) + for new_url, new_parent in reversed(new_links): + new_depth = depths.get(new_url, depth + 1) + stack.append((new_url, new_parent, new_depth)) diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py index c8af3022..122be482 100644 --- a/crawl4ai/deep_crawling/filters.py +++ b/crawl4ai/deep_crawling/filters.py @@ -124,6 +124,7 @@ class URLPatternFilter(URLFilter): "_simple_prefixes", "_domain_patterns", "_path_patterns", + "_reverse", ) PATTERN_TYPES = { @@ -138,8 +139,10 @@ class URLPatternFilter(URLFilter): self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True, + reverse: bool = False, ): super().__init__() + self._reverse = reverse patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns self._simple_suffixes = set() @@ -205,36 +208,40 @@ class URLPatternFilter(URLFilter): @lru_cache(maxsize=10000) def apply(self, url: str) -> bool: - """Hierarchical pattern matching""" # Quick suffix check (*.html) if self._simple_suffixes: path = url.split("?")[0] if path.split("/")[-1].split(".")[-1] in self._simple_suffixes: - 
self._update_stats(True) - return True + result = True + self._update_stats(result) + return not result if self._reverse else result # Domain check if self._domain_patterns: for pattern in self._domain_patterns: if pattern.match(url): - self._update_stats(True) - return True + result = True + self._update_stats(result) + return not result if self._reverse else result # Prefix check (/foo/*) if self._simple_prefixes: path = url.split("?")[0] if any(path.startswith(p) for p in self._simple_prefixes): - self._update_stats(True) - return True + result = True + self._update_stats(result) + return not result if self._reverse else result # Complex patterns if self._path_patterns: if any(p.search(url) for p in self._path_patterns): - self._update_stats(True) - return True + result = True + self._update_stats(result) + return not result if self._reverse else result - self._update_stats(False) - return False + result = False + self._update_stats(result) + return not result if self._reverse else result class ContentTypeFilter(URLFilter): @@ -427,6 +434,11 @@ class DomainFilter(URLFilter): if isinstance(domains, str): return {domains.lower()} return {d.lower() for d in domains} + + @staticmethod + def _is_subdomain(domain: str, parent_domain: str) -> bool: + """Check if domain is a subdomain of parent_domain""" + return domain == parent_domain or domain.endswith(f".{parent_domain}") @staticmethod @lru_cache(maxsize=10000) @@ -444,20 +456,26 @@ class DomainFilter(URLFilter): domain = self._extract_domain(url) - # Early return for blocked domains - if domain in self._blocked_domains: - self._update_stats(False) - return False + # Check for blocked domains, including subdomains + for blocked in self._blocked_domains: + if self._is_subdomain(domain, blocked): + self._update_stats(False) + return False # If no allowed domains specified, accept all non-blocked if self._allowed_domains is None: self._update_stats(True) return True - # Final allowed domains check - result = domain in 
self._allowed_domains - self._update_stats(result) - return result + # Check if domain matches any allowed domain (including subdomains) + for allowed in self._allowed_domains: + if self._is_subdomain(domain, allowed): + self._update_stats(True) + return True + + # No matches found + self._update_stats(False) + return False class ContentRelevanceFilter(URLFilter): diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index ebd826a2..97512bf3 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -4,12 +4,10 @@ from typing import Any, List, Dict, Optional from concurrent.futures import ThreadPoolExecutor, as_completed import json import time -import os from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH from .config import ( - DEFAULT_PROVIDER, PROVIDER_MODELS, - CHUNK_TOKEN_THRESHOLD, + DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD, OVERLAP_RATE, WORD_TOKEN_RATE, ) @@ -22,9 +20,7 @@ from .utils import ( extract_xml_data, split_and_parse_json_objects, sanitize_input_encode, - chunk_documents, merge_chunks, - advanced_split, ) from .models import * # noqa: F403 @@ -38,8 +34,9 @@ from .model_loader import ( calculate_batch_size ) +from .types import LLMConfig + from functools import partial -import math import numpy as np import re from bs4 import BeautifulSoup @@ -481,8 +478,7 @@ class LLMExtractionStrategy(ExtractionStrategy): A strategy that uses an LLM to extract meaningful content from the HTML. Attributes: - provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". - api_token: The API token for the provider. + llm_config: The LLM configuration object. instruction: The instruction to use for the LLM model. schema: Pydantic model schema for structured data. extraction_type: "block" or "schema". 
@@ -490,27 +486,20 @@ class LLMExtractionStrategy(ExtractionStrategy): overlap_rate: Overlap between chunks. word_token_rate: Word to token conversion rate. apply_chunking: Whether to apply chunking. - base_url: The base URL for the API request. - api_base: The base URL for the API request. - extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. verbose: Whether to print verbose output. usages: List of individual token usages. total_usage: Accumulated token usage. """ _UNWANTED_PROPS = { - 'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")', - 'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")', - 'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")', - 'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")', + 'provider' : 'Instead, use llm_config=LLMConfig(provider="...")', + 'api_token' : 'Instead, use llm_config=LlMConfig(api_token="...")', + 'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")', + 'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")', } def __init__( self, - llmConfig: 'LLMConfig' = None, + llm_config: 'LLMConfig' = None, instruction: str = None, - provider: str = DEFAULT_PROVIDER, - api_token: Optional[str] = None, - base_url: str = None, - api_base: str = None, schema: Dict = None, extraction_type="block", chunk_token_threshold=CHUNK_TOKEN_THRESHOLD, @@ -519,15 +508,18 @@ class LLMExtractionStrategy(ExtractionStrategy): apply_chunking=True, input_format: str = "markdown", verbose=False, + # Deprecated arguments + provider: str = DEFAULT_PROVIDER, + api_token: Optional[str] = None, + base_url: str = None, + api_base: str = None, **kwargs, ): """ Initialize the strategy with clustering parameters. Args: - llmConfig: The LLM configuration object. - provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". - api_token: The API token for the provider. + llm_config: The LLM configuration object. 
instruction: The instruction to use for the LLM model. schema: Pydantic model schema for structured data. extraction_type: "block" or "schema". @@ -535,20 +527,19 @@ class LLMExtractionStrategy(ExtractionStrategy): overlap_rate: Overlap between chunks. word_token_rate: Word to token conversion rate. apply_chunking: Whether to apply chunking. - base_url: The base URL for the API request. - api_base: The base URL for the API request. - extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. verbose: Whether to print verbose output. usages: List of individual token usages. total_usage: Accumulated token usage. + # Deprecated arguments, will be removed very soon + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. 
""" super().__init__( input_format=input_format, **kwargs) - self.llmConfig = llmConfig - self.provider = provider - self.api_token = api_token - self.base_url = base_url - self.api_base = api_base + self.llm_config = llm_config self.instruction = instruction self.extract_type = extraction_type self.schema = schema @@ -565,6 +556,11 @@ class LLMExtractionStrategy(ExtractionStrategy): self.usages = [] # Store individual usages self.total_usage = TokenUsage() # Accumulated usage + self.provider = provider + self.api_token = api_token + self.base_url = base_url + self.api_base = api_base + def __setattr__(self, name, value): """Handle attribute setting.""" @@ -618,10 +614,10 @@ class LLMExtractionStrategy(ExtractionStrategy): ) response = perform_completion_with_backoff( - self.llmConfig.provider, + self.llm_config.provider, prompt_with_variables, - self.llmConfig.api_token, - base_url=self.llmConfig.base_url, + self.llm_config.api_token, + base_url=self.llm_config.base_url, extra_args=self.extra_args, ) # , json_response=self.extract_type == "schema") # Track usage @@ -701,7 +697,7 @@ class LLMExtractionStrategy(ExtractionStrategy): overlap=int(self.chunk_token_threshold * self.overlap_rate), ) extracted_content = [] - if self.llmConfig.provider.startswith("groq/"): + if self.llm_config.provider.startswith("groq/"): # Sequential processing with a delay for ix, section in enumerate(merged_sections): extract_func = partial(self.extract, url) @@ -1043,8 +1039,8 @@ class JsonElementExtractionStrategy(ExtractionStrategy): pass _GENERATE_SCHEMA_UNWANTED_PROPS = { - 'provider': 'Instead, use llmConfig=LlmConfig(provider="...")', - 'api_token': 'Instead, use llmConfig=LlMConfig(api_token="...")', + 'provider': 'Instead, use llm_config=LLMConfig(provider="...")', + 'api_token': 'Instead, use llm_config=LlMConfig(api_token="...")', } @staticmethod @@ -1053,7 +1049,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy): schema_type: str = "CSS", # or XPATH query: str = 
None, target_json_example: str = None, - llmConfig: 'LLMConfig' = None, + llm_config: 'LLMConfig' = None, provider: str = None, api_token: str = None, **kwargs @@ -1066,9 +1062,9 @@ class JsonElementExtractionStrategy(ExtractionStrategy): query (str, optional): Natural language description of what data to extract provider (str): Legacy Parameter. LLM provider to use api_token (str): Legacy Parameter. API token for LLM provider - llmConfig (LlmConfig): LLM configuration object + llm_config (LLMConfig): LLM configuration object prompt (str, optional): Custom prompt template to use - **kwargs: Additional args passed to perform_completion_with_backoff + **kwargs: Additional args passed to LLM processor Returns: dict: Generated schema following the JsonElementExtractionStrategy format @@ -1130,11 +1126,12 @@ In this scenario, use your best judgment to generate the schema. Try to maximize try: # Call LLM with backoff handling response = perform_completion_with_backoff( - provider=llmConfig.provider, + provider=llm_config.provider, prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]), json_response = True, - api_token=llmConfig.api_token, - **kwargs + api_token=llm_config.api_token, + base_url=llm_config.base_url, + extra_args=kwargs ) # Extract and return schema @@ -1171,7 +1168,8 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): super().__init__(schema, **kwargs) def _parse_html(self, html_content: str): - return BeautifulSoup(html_content, "html.parser") + # return BeautifulSoup(html_content, "html.parser") + return BeautifulSoup(html_content, "lxml") def _get_base_elements(self, parsed_html, selector: str): return parsed_html.select(selector) @@ -1190,6 +1188,373 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): def _get_element_attribute(self, element, attribute: str): return element.get(attribute) +class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy): + def __init__(self, schema: 
Dict[str, Any], **kwargs): + kwargs["input_format"] = "html" + super().__init__(schema, **kwargs) + self._selector_cache = {} + self._xpath_cache = {} + self._result_cache = {} + + # Control selector optimization strategy + self.use_caching = kwargs.get("use_caching", True) + self.optimize_common_patterns = kwargs.get("optimize_common_patterns", True) + + # Load lxml dependencies once + from lxml import etree, html + from lxml.cssselect import CSSSelector + self.etree = etree + self.html_parser = html + self.CSSSelector = CSSSelector + + def _parse_html(self, html_content: str): + """Parse HTML content with error recovery""" + try: + parser = self.etree.HTMLParser(recover=True, remove_blank_text=True) + return self.etree.fromstring(html_content, parser) + except Exception as e: + if self.verbose: + print(f"Error parsing HTML, falling back to alternative method: {e}") + try: + return self.html_parser.fromstring(html_content) + except Exception as e2: + if self.verbose: + print(f"Critical error parsing HTML: {e2}") + # Create minimal document as fallback + return self.etree.Element("html") + + def _optimize_selector(self, selector_str): + """Optimize common selector patterns for better performance""" + if not self.optimize_common_patterns: + return selector_str + + # Handle td:nth-child(N) pattern which is very common in table scraping + import re + if re.search(r'td:nth-child\(\d+\)', selector_str): + return selector_str # Already handled specially in _apply_selector + + # Split complex selectors into parts for optimization + parts = selector_str.split() + if len(parts) <= 1: + return selector_str + + # For very long selectors, consider using just the last specific part + if len(parts) > 3 and any(p.startswith('.') or p.startswith('#') for p in parts): + specific_parts = [p for p in parts if p.startswith('.') or p.startswith('#')] + if specific_parts: + return specific_parts[-1] # Use most specific class/id selector + + return selector_str + + def 
_create_selector_function(self, selector_str): + """Create a selector function that handles all edge cases""" + original_selector = selector_str + + # Try to optimize the selector if appropriate + if self.optimize_common_patterns: + selector_str = self._optimize_selector(selector_str) + + try: + # Attempt to compile the CSS selector + compiled = self.CSSSelector(selector_str) + xpath = compiled.path + + # Store XPath for later use + self._xpath_cache[selector_str] = xpath + + # Create the wrapper function that implements the selection strategy + def selector_func(element, context_sensitive=True): + cache_key = None + + # Use result caching if enabled + if self.use_caching: + # Create a cache key based on element and selector + element_id = element.get('id', '') or str(hash(element)) + cache_key = f"{element_id}::{selector_str}" + + if cache_key in self._result_cache: + return self._result_cache[cache_key] + + results = [] + try: + # Strategy 1: Direct CSS selector application (fastest) + results = compiled(element) + + # If that fails and we need context sensitivity + if not results and context_sensitive: + # Strategy 2: Try XPath with context adjustment + context_xpath = self._make_context_sensitive_xpath(xpath, element) + if context_xpath: + results = element.xpath(context_xpath) + + # Strategy 3: Handle special case - nth-child + if not results and 'nth-child' in original_selector: + results = self._handle_nth_child_selector(element, original_selector) + + # Strategy 4: Direct descendant search for class/ID selectors + if not results: + results = self._fallback_class_id_search(element, original_selector) + + # Strategy 5: Last resort - tag name search for the final part + if not results: + parts = original_selector.split() + if parts: + last_part = parts[-1] + # Extract tag name from the selector + tag_match = re.match(r'^(\w+)', last_part) + if tag_match: + tag_name = tag_match.group(1) + results = element.xpath(f".//{tag_name}") + + # Cache results if caching 
is enabled + if self.use_caching and cache_key: + self._result_cache[cache_key] = results + + except Exception as e: + if self.verbose: + print(f"Error applying selector '{selector_str}': {e}") + + return results + + return selector_func + + except Exception as e: + if self.verbose: + print(f"Error compiling selector '{selector_str}': {e}") + + # Fallback function for invalid selectors + return lambda element, context_sensitive=True: [] + + def _make_context_sensitive_xpath(self, xpath, element): + """Convert absolute XPath to context-sensitive XPath""" + try: + # If starts with descendant-or-self, it's already context-sensitive + if xpath.startswith('descendant-or-self::'): + return xpath + + # Remove leading slash if present + if xpath.startswith('/'): + context_xpath = f".{xpath}" + else: + context_xpath = f".//{xpath}" + + # Validate the XPath by trying it + try: + element.xpath(context_xpath) + return context_xpath + except: + # If that fails, try a simpler descendant search + return f".//{xpath.split('/')[-1]}" + except: + return None + + def _handle_nth_child_selector(self, element, selector_str): + """Special handling for nth-child selectors in tables""" + import re + results = [] + + try: + # Extract the column number from td:nth-child(N) + match = re.search(r'td:nth-child\((\d+)\)', selector_str) + if match: + col_num = match.group(1) + + # Check if there's content after the nth-child part + remaining_selector = selector_str.split(f"td:nth-child({col_num})", 1)[-1].strip() + + if remaining_selector: + # If there's a specific element we're looking for after the column + # Extract any tag names from the remaining selector + tag_match = re.search(r'(\w+)', remaining_selector) + tag_name = tag_match.group(1) if tag_match else '*' + results = element.xpath(f".//td[{col_num}]//{tag_name}") + else: + # Just get the column cell + results = element.xpath(f".//td[{col_num}]") + except Exception as e: + if self.verbose: + print(f"Error handling nth-child selector: 
{e}") + + return results + + def _fallback_class_id_search(self, element, selector_str): + """Fallback to search by class or ID""" + results = [] + + try: + # Extract class selectors (.classname) + import re + class_matches = re.findall(r'\.([a-zA-Z0-9_-]+)', selector_str) + + # Extract ID selectors (#idname) + id_matches = re.findall(r'#([a-zA-Z0-9_-]+)', selector_str) + + # Try each class + for class_name in class_matches: + class_results = element.xpath(f".//*[contains(@class, '{class_name}')]") + results.extend(class_results) + + # Try each ID (usually more specific) + for id_name in id_matches: + id_results = element.xpath(f".//*[@id='{id_name}']") + results.extend(id_results) + except Exception as e: + if self.verbose: + print(f"Error in fallback class/id search: {e}") + + return results + + def _get_selector(self, selector_str): + """Get or create a selector function with caching""" + if selector_str not in self._selector_cache: + self._selector_cache[selector_str] = self._create_selector_function(selector_str) + return self._selector_cache[selector_str] + + def _get_base_elements(self, parsed_html, selector: str): + """Get all base elements using the selector""" + selector_func = self._get_selector(selector) + # For base elements, we don't need context sensitivity + return selector_func(parsed_html, context_sensitive=False) + + def _get_elements(self, element, selector: str): + """Get child elements using the selector with context sensitivity""" + selector_func = self._get_selector(selector) + return selector_func(element, context_sensitive=True) + + def _get_element_text(self, element) -> str: + """Extract normalized text from element""" + try: + # Get all text nodes and normalize + text = " ".join(t.strip() for t in element.xpath(".//text()") if t.strip()) + return text + except Exception as e: + if self.verbose: + print(f"Error extracting text: {e}") + # Fallback + try: + return element.text_content().strip() + except: + return "" + + def 
_get_element_html(self, element) -> str: + """Get HTML string representation of element""" + try: + return self.etree.tostring(element, encoding='unicode', method='html') + except Exception as e: + if self.verbose: + print(f"Error serializing HTML: {e}") + return "" + + def _get_element_attribute(self, element, attribute: str): + """Get attribute value safely""" + try: + return element.get(attribute) + except Exception as e: + if self.verbose: + print(f"Error getting attribute '{attribute}': {e}") + return None + + def _clear_caches(self): + """Clear caches to free memory""" + if self.use_caching: + self._result_cache.clear() + +class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs["input_format"] = "html" # Force HTML input + super().__init__(schema, **kwargs) + self._selector_cache = {} + + def _parse_html(self, html_content: str): + from lxml import etree + parser = etree.HTMLParser(recover=True) + return etree.fromstring(html_content, parser) + + def _get_selector(self, selector_str): + """Get a selector function that works within the context of an element""" + if selector_str not in self._selector_cache: + from lxml.cssselect import CSSSelector + try: + # Store both the compiled selector and its xpath translation + compiled = CSSSelector(selector_str) + + # Create a function that will apply this selector appropriately + def select_func(element): + try: + # First attempt: direct CSS selector application + results = compiled(element) + if results: + return results + + # Second attempt: contextual XPath selection + # Convert the root-based XPath to a context-based XPath + xpath = compiled.path + + # If the XPath already starts with descendant-or-self, handle it specially + if xpath.startswith('descendant-or-self::'): + context_xpath = xpath + else: + # For normal XPath expressions, make them relative to current context + context_xpath = f"./{xpath.lstrip('/')}" + + results = 
element.xpath(context_xpath) + if results: + return results + + # Final fallback: simple descendant search for common patterns + if 'nth-child' in selector_str: + # Handle td:nth-child(N) pattern + import re + match = re.search(r'td:nth-child\((\d+)\)', selector_str) + if match: + col_num = match.group(1) + sub_selector = selector_str.split(')', 1)[-1].strip() + if sub_selector: + return element.xpath(f".//td[{col_num}]//{sub_selector}") + else: + return element.xpath(f".//td[{col_num}]") + + # Last resort: try each part of the selector separately + parts = selector_str.split() + if len(parts) > 1 and parts[-1]: + return element.xpath(f".//{parts[-1]}") + + return [] + except Exception as e: + if self.verbose: + print(f"Error applying selector '{selector_str}': {e}") + return [] + + self._selector_cache[selector_str] = select_func + except Exception as e: + if self.verbose: + print(f"Error compiling selector '{selector_str}': {e}") + + # Fallback function for invalid selectors + def fallback_func(element): + return [] + + self._selector_cache[selector_str] = fallback_func + + return self._selector_cache[selector_str] + + def _get_base_elements(self, parsed_html, selector: str): + selector_func = self._get_selector(selector) + return selector_func(parsed_html) + + def _get_elements(self, element, selector: str): + selector_func = self._get_selector(selector) + return selector_func(element) + + def _get_element_text(self, element) -> str: + return "".join(element.xpath(".//text()")).strip() + + def _get_element_html(self, element) -> str: + from lxml import etree + return etree.tostring(element, encoding='unicode') + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): """ diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index f37abc18..e89239f3 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ 
b/crawl4ai/markdown_generation_strategy.py @@ -1,8 +1,8 @@ from abc import ABC, abstractmethod -from tabnanny import verbose from typing import Optional, Dict, Any, Tuple from .models import MarkdownGenerationResult from .html2text import CustomHTML2Text +# from .types import RelevantContentFilter from .content_filter_strategy import RelevantContentFilter import re from urllib.parse import urljoin diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 4c96adf0..474e679e 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -37,13 +37,33 @@ class CrawlStatus(Enum): FAILED = "FAILED" +# @dataclass +# class CrawlStats: +# task_id: str +# url: str +# status: CrawlStatus +# start_time: Optional[datetime] = None +# end_time: Optional[datetime] = None +# memory_usage: float = 0.0 +# peak_memory: float = 0.0 +# error_message: str = "" + +# @property +# def duration(self) -> str: +# if not self.start_time: +# return "0:00" +# end = self.end_time or datetime.now() +# duration = end - self.start_time +# return str(timedelta(seconds=int(duration.total_seconds()))) + + @dataclass class CrawlStats: task_id: str url: str status: CrawlStatus - start_time: Optional[datetime] = None - end_time: Optional[datetime] = None + start_time: Optional[Union[datetime, float]] = None + end_time: Optional[Union[datetime, float]] = None memory_usage: float = 0.0 peak_memory: float = 0.0 error_message: str = "" @@ -52,11 +72,21 @@ class CrawlStats: def duration(self) -> str: if not self.start_time: return "0:00" + + # Convert start_time to datetime if it's a float + start = self.start_time + if isinstance(start, float): + start = datetime.fromtimestamp(start) + + # Get end time or use current time end = self.end_time or datetime.now() - duration = end - self.start_time + # Convert end_time to datetime if it's a float + if isinstance(end, float): + end = datetime.fromtimestamp(end) + + duration = end - start return str(timedelta(seconds=int(duration.total_seconds()))) - class 
DisplayMode(Enum): DETAILED = "DETAILED" AGGREGATED = "AGGREGATED" @@ -149,7 +179,11 @@ class CrawlResult(BaseModel): markdown_result = data.pop('markdown', None) super().__init__(**data) if markdown_result is not None: - self._markdown = markdown_result + self._markdown = ( + MarkdownGenerationResult(**markdown_result) + if isinstance(markdown_result, dict) + else markdown_result + ) @property def markdown(self): @@ -292,6 +326,7 @@ class Media(BaseModel): audios: List[ MediaItem ] = [] # Using MediaItem model for now, can be extended with Audio model if needed + tables: List[Dict] = [] # Table data extracted from HTML tables class Links(BaseModel): diff --git a/crawl4ai/proxy_strategy.py b/crawl4ai/proxy_strategy.py index 0776e68a..6821c566 100644 --- a/crawl4ai/proxy_strategy.py +++ b/crawl4ai/proxy_strategy.py @@ -1,8 +1,119 @@ from typing import List, Dict, Optional from abc import ABC, abstractmethod from itertools import cycle +import os + + +class ProxyConfig: + def __init__( + self, + server: str, + username: Optional[str] = None, + password: Optional[str] = None, + ip: Optional[str] = None, + ): + """Configuration class for a single proxy. 
+ + Args: + server: Proxy server URL (e.g., "http://127.0.0.1:8080") + username: Optional username for proxy authentication + password: Optional password for proxy authentication + ip: Optional IP address for verification purposes + """ + self.server = server + self.username = username + self.password = password + + # Extract IP from server if not explicitly provided + self.ip = ip or self._extract_ip_from_server() + + def _extract_ip_from_server(self) -> Optional[str]: + """Extract IP address from server URL.""" + try: + # Simple extraction assuming http://ip:port format + if "://" in self.server: + parts = self.server.split("://")[1].split(":") + return parts[0] + else: + parts = self.server.split(":") + return parts[0] + except Exception: + return None + + @staticmethod + def from_string(proxy_str: str) -> "ProxyConfig": + """Create a ProxyConfig from a string in the format 'ip:port:username:password'.""" + parts = proxy_str.split(":") + if len(parts) == 4: # ip:port:username:password + ip, port, username, password = parts + return ProxyConfig( + server=f"http://{ip}:{port}", + username=username, + password=password, + ip=ip + ) + elif len(parts) == 2: # ip:port only + ip, port = parts + return ProxyConfig( + server=f"http://{ip}:{port}", + ip=ip + ) + else: + raise ValueError(f"Invalid proxy string format: {proxy_str}") + + @staticmethod + def from_dict(proxy_dict: Dict) -> "ProxyConfig": + """Create a ProxyConfig from a dictionary.""" + return ProxyConfig( + server=proxy_dict.get("server"), + username=proxy_dict.get("username"), + password=proxy_dict.get("password"), + ip=proxy_dict.get("ip") + ) + + @staticmethod + def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]: + """Load proxies from environment variable. 
+ + Args: + env_var: Name of environment variable containing comma-separated proxy strings + + Returns: + List of ProxyConfig objects + """ + proxies = [] + try: + proxy_list = os.getenv(env_var, "").split(",") + for proxy in proxy_list: + if not proxy: + continue + proxies.append(ProxyConfig.from_string(proxy)) + except Exception as e: + print(f"Error loading proxies from environment: {e}") + return proxies + + def to_dict(self) -> Dict: + """Convert to dictionary representation.""" + return { + "server": self.server, + "username": self.username, + "password": self.password, + "ip": self.ip + } + + def clone(self, **kwargs) -> "ProxyConfig": + """Create a copy of this configuration with updated values. + + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + ProxyConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return ProxyConfig.from_dict(config_dict) -from crawl4ai.configs import ProxyConfig class ProxyRotationStrategy(ABC): """Base abstract class for proxy rotation strategies""" diff --git a/crawl4ai/types.py b/crawl4ai/types.py index 7c2586a3..2f689e1c 100644 --- a/crawl4ai/types.py +++ b/crawl4ai/types.py @@ -1,14 +1,181 @@ from typing import TYPE_CHECKING, Union -AsyncWebCrawler = Union['AsyncWebCrawlerType'] # Note the string literal -CrawlerRunConfig = Union['CrawlerRunConfigType'] +# Logger types +AsyncLoggerBase = Union['AsyncLoggerBaseType'] +AsyncLogger = Union['AsyncLoggerType'] + +# Crawler core types +AsyncWebCrawler = Union['AsyncWebCrawlerType'] +CacheMode = Union['CacheModeType'] CrawlResult = Union['CrawlResultType'] +CrawlerHub = Union['CrawlerHubType'] +BrowserProfiler = Union['BrowserProfilerType'] + +# Configuration types +BrowserConfig = Union['BrowserConfigType'] +CrawlerRunConfig = Union['CrawlerRunConfigType'] +HTTPCrawlerConfig = Union['HTTPCrawlerConfigType'] +LLMConfig = Union['LLMConfigType'] + +# Content scraping types 
+ContentScrapingStrategy = Union['ContentScrapingStrategyType'] +WebScrapingStrategy = Union['WebScrapingStrategyType'] +LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType'] + +# Proxy types +ProxyRotationStrategy = Union['ProxyRotationStrategyType'] +RoundRobinProxyStrategy = Union['RoundRobinProxyStrategyType'] + +# Extraction types +ExtractionStrategy = Union['ExtractionStrategyType'] +LLMExtractionStrategy = Union['LLMExtractionStrategyType'] +CosineStrategy = Union['CosineStrategyType'] +JsonCssExtractionStrategy = Union['JsonCssExtractionStrategyType'] +JsonXPathExtractionStrategy = Union['JsonXPathExtractionStrategyType'] + +# Chunking types +ChunkingStrategy = Union['ChunkingStrategyType'] +RegexChunking = Union['RegexChunkingType'] + +# Markdown generation types +DefaultMarkdownGenerator = Union['DefaultMarkdownGeneratorType'] +MarkdownGenerationResult = Union['MarkdownGenerationResultType'] + +# Content filter types +RelevantContentFilter = Union['RelevantContentFilterType'] +PruningContentFilter = Union['PruningContentFilterType'] +BM25ContentFilter = Union['BM25ContentFilterType'] +LLMContentFilter = Union['LLMContentFilterType'] + +# Dispatcher types +BaseDispatcher = Union['BaseDispatcherType'] +MemoryAdaptiveDispatcher = Union['MemoryAdaptiveDispatcherType'] +SemaphoreDispatcher = Union['SemaphoreDispatcherType'] +RateLimiter = Union['RateLimiterType'] +CrawlerMonitor = Union['CrawlerMonitorType'] +DisplayMode = Union['DisplayModeType'] RunManyReturn = Union['RunManyReturnType'] +# Docker client +Crawl4aiDockerClient = Union['Crawl4aiDockerClientType'] + +# Deep crawling types +DeepCrawlStrategy = Union['DeepCrawlStrategyType'] +BFSDeepCrawlStrategy = Union['BFSDeepCrawlStrategyType'] +FilterChain = Union['FilterChainType'] +ContentTypeFilter = Union['ContentTypeFilterType'] +DomainFilter = Union['DomainFilterType'] +URLFilter = Union['URLFilterType'] +FilterStats = Union['FilterStatsType'] +SEOFilter = Union['SEOFilterType'] 
+KeywordRelevanceScorer = Union['KeywordRelevanceScorerType'] +URLScorer = Union['URLScorerType'] +CompositeScorer = Union['CompositeScorerType'] +DomainAuthorityScorer = Union['DomainAuthorityScorerType'] +FreshnessScorer = Union['FreshnessScorerType'] +PathDepthScorer = Union['PathDepthScorerType'] +BestFirstCrawlingStrategy = Union['BestFirstCrawlingStrategyType'] +DFSDeepCrawlStrategy = Union['DFSDeepCrawlStrategyType'] +DeepCrawlDecorator = Union['DeepCrawlDecoratorType'] + +# Only import types during type checking to avoid circular imports if TYPE_CHECKING: - from . import ( + # Logger imports + from .async_logger import ( + AsyncLoggerBase as AsyncLoggerBaseType, + AsyncLogger as AsyncLoggerType, + ) + + # Crawler core imports + from .async_webcrawler import ( AsyncWebCrawler as AsyncWebCrawlerType, + CacheMode as CacheModeType, + ) + from .models import CrawlResult as CrawlResultType + from .hub import CrawlerHub as CrawlerHubType + from .browser_profiler import BrowserProfiler as BrowserProfilerType + + # Configuration imports + from .async_configs import ( + BrowserConfig as BrowserConfigType, CrawlerRunConfig as CrawlerRunConfigType, - CrawlResult as CrawlResultType, + HTTPCrawlerConfig as HTTPCrawlerConfigType, + LLMConfig as LLMConfigType, + ) + + # Content scraping imports + from .content_scraping_strategy import ( + ContentScrapingStrategy as ContentScrapingStrategyType, + WebScrapingStrategy as WebScrapingStrategyType, + LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType, + ) + + # Proxy imports + from .proxy_strategy import ( + ProxyRotationStrategy as ProxyRotationStrategyType, + RoundRobinProxyStrategy as RoundRobinProxyStrategyType, + ) + + # Extraction imports + from .extraction_strategy import ( + ExtractionStrategy as ExtractionStrategyType, + LLMExtractionStrategy as LLMExtractionStrategyType, + CosineStrategy as CosineStrategyType, + JsonCssExtractionStrategy as JsonCssExtractionStrategyType, + JsonXPathExtractionStrategy as 
JsonXPathExtractionStrategyType, + ) + + # Chunking imports + from .chunking_strategy import ( + ChunkingStrategy as ChunkingStrategyType, + RegexChunking as RegexChunkingType, + ) + + # Markdown generation imports + from .markdown_generation_strategy import ( + DefaultMarkdownGenerator as DefaultMarkdownGeneratorType, + ) + from .models import MarkdownGenerationResult as MarkdownGenerationResultType + + # Content filter imports + from .content_filter_strategy import ( + RelevantContentFilter as RelevantContentFilterType, + PruningContentFilter as PruningContentFilterType, + BM25ContentFilter as BM25ContentFilterType, + LLMContentFilter as LLMContentFilterType, + ) + + # Dispatcher imports + from .async_dispatcher import ( + BaseDispatcher as BaseDispatcherType, + MemoryAdaptiveDispatcher as MemoryAdaptiveDispatcherType, + SemaphoreDispatcher as SemaphoreDispatcherType, + RateLimiter as RateLimiterType, + CrawlerMonitor as CrawlerMonitorType, + DisplayMode as DisplayModeType, RunManyReturn as RunManyReturnType, + ) + + # Docker client + from .docker_client import Crawl4aiDockerClient as Crawl4aiDockerClientType + + # Deep crawling imports + from .deep_crawling import ( + DeepCrawlStrategy as DeepCrawlStrategyType, + BFSDeepCrawlStrategy as BFSDeepCrawlStrategyType, + FilterChain as FilterChainType, + ContentTypeFilter as ContentTypeFilterType, + DomainFilter as DomainFilterType, + URLFilter as URLFilterType, + FilterStats as FilterStatsType, + SEOFilter as SEOFilterType, + KeywordRelevanceScorer as KeywordRelevanceScorerType, + URLScorer as URLScorerType, + CompositeScorer as CompositeScorerType, + DomainAuthorityScorer as DomainAuthorityScorerType, + FreshnessScorer as FreshnessScorerType, + PathDepthScorer as PathDepthScorerType, + BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType, + DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType, + DeepCrawlDecorator as DeepCrawlDecoratorType, ) \ No newline at end of file diff --git a/crawl4ai/user_agent_generator.py 
b/crawl4ai/user_agent_generator.py index 91e7a31d..df212568 100644 --- a/crawl4ai/user_agent_generator.py +++ b/crawl4ai/user_agent_generator.py @@ -3,12 +3,11 @@ from typing import Optional, Literal, List, Dict, Tuple import re from abc import ABC, abstractmethod -import random from fake_useragent import UserAgent import requests from lxml import html import json -from typing import Optional, List, Union, Dict +from typing import Union class UAGen(ABC): @abstractmethod diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index b5a50eab..146ce06c 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1,5 +1,4 @@ import time -from urllib.parse import urlparse from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString import json @@ -33,6 +32,8 @@ import hashlib from urllib.robotparser import RobotFileParser import aiohttp +from urllib.parse import urlparse, urlunparse +from functools import lru_cache from packaging import version from . 
import __version__ @@ -1962,6 +1963,82 @@ def normalize_url(href, base_url): return normalized +def normalize_url_for_deep_crawl(href, base_url): + """Normalize URLs to ensure consistent format""" + from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode + + # Handle None or empty values + if not href: + return None + + # Use urljoin to handle relative URLs + full_url = urljoin(base_url, href.strip()) + + # Parse the URL for normalization + parsed = urlparse(full_url) + + # Convert hostname to lowercase + netloc = parsed.netloc.lower() + + # Remove fragment entirely + fragment = '' + + # Normalize query parameters if needed + query = parsed.query + if query: + # Parse query parameters + params = parse_qs(query) + + # Remove tracking parameters (example - customize as needed) + tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid'] + for param in tracking_params: + if param in params: + del params[param] + + # Rebuild query string, sorted for consistency + query = urlencode(params, doseq=True) if params else '' + + # Build normalized URL + normalized = urlunparse(( + parsed.scheme, + netloc, + parsed.path.rstrip('/') or '/', # Normalize trailing slash + parsed.params, + query, + fragment + )) + + return normalized + +@lru_cache(maxsize=10000) +def efficient_normalize_url_for_deep_crawl(href, base_url): + """Efficient URL normalization with proper parsing""" + from urllib.parse import urljoin + + if not href: + return None + + # Resolve relative URLs + full_url = urljoin(base_url, href.strip()) + + # Use proper URL parsing + parsed = urlparse(full_url) + + # Only perform the most critical normalizations + # 1. Lowercase hostname + # 2. 
Remove fragment + normalized = urlunparse(( + parsed.scheme, + parsed.netloc.lower(), + parsed.path, + parsed.params, + parsed.query, + '' # Remove fragment + )) + + return normalized + + def normalize_url_tmp(href, base_url): """Normalize URLs to ensure consistent format""" # Extract protocol and domain from base URL diff --git a/deploy/docker/README.md b/deploy/docker/README.md index fbed6576..c4582031 100644 --- a/deploy/docker/README.md +++ b/deploy/docker/README.md @@ -352,7 +352,10 @@ Example: from crawl4ai import CrawlerRunConfig, PruningContentFilter config = CrawlerRunConfig( - content_filter=PruningContentFilter(threshold=0.48) + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed") + ), + cache_mode= CacheMode.BYPASS ) print(config.dump()) # Use this JSON in your API calls ``` @@ -595,8 +598,8 @@ curl http://localhost:8000/health ## Complete Examples Check out the `examples` folder in our repository for full working examples! 
Here are two to get you started: -[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk_example.py) -[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api_example.py) +[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk.py) +[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api.py) ## Server Configuration diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 9d0b8c3c..cc103905 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -18,7 +18,8 @@ from crawl4ai import ( CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, - RateLimiter + RateLimiter, + LLMConfig ) from crawl4ai.utils import perform_completion_with_backoff from crawl4ai.content_filter_strategy import ( @@ -103,8 +104,10 @@ async def process_llm_extraction( else: api_key = os.environ.get(config["llm"].get("api_key_env", None), "") llm_strategy = LLMExtractionStrategy( - provider=config["llm"]["provider"], - api_token=api_key, + llm_config=LLMConfig( + provider=config["llm"]["provider"], + api_token=api_key + ), instruction=instruction, schema=json.loads(schema) if schema else None, ) @@ -164,8 +167,10 @@ async def handle_markdown_request( FilterType.FIT: PruningContentFilter(), FilterType.BM25: BM25ContentFilter(user_query=query or ""), FilterType.LLM: LLMContentFilter( - provider=config["llm"]["provider"], - api_token=os.environ.get(config["llm"].get("api_key_env", None), ""), + llm_config=LLMConfig( + provider=config["llm"]["provider"], + api_token=os.environ.get(config["llm"].get("api_key_env", None), ""), + ), instruction=query or "Extract main content" ) }[filter_type] diff --git a/deploy/docker/auth.py b/deploy/docker/auth.py index 8851bd36..f9e75d78 100644 --- a/deploy/docker/auth.py +++ b/deploy/docker/auth.py @@ -10,7 +10,7 @@ from pydantic.main import BaseModel import base64 instance = JWT() 
-security = HTTPBearer() +security = HTTPBearer(auto_error=False) SECRET_KEY = os.environ.get("SECRET_KEY", "mysecret") ACCESS_TOKEN_EXPIRE_MINUTES = 60 @@ -30,6 +30,9 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) - def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict: """Verify the JWT token from the Authorization header.""" + + if credentials is None: + return None token = credentials.credentials verifying_key = get_jwk_from_secret(SECRET_KEY) try: @@ -38,9 +41,15 @@ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) except Exception: raise HTTPException(status_code=401, detail="Invalid or expired token") + def get_token_dependency(config: Dict): - """Return the token dependency if JWT is enabled, else None.""" - return verify_token if config.get("security", {}).get("jwt_enabled", False) else None + """Return the token dependency if JWT is enabled, else a function that returns None.""" + + if config.get("security", {}).get("jwt_enabled", False): + return verify_token + else: + return lambda: None + class TokenRequest(BaseModel): email: EmailStr \ No newline at end of file diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index fc118bf4..6ad7bb7a 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -3,7 +3,7 @@ app: title: "Crawl4AI API" version: "1.0.0" host: "0.0.0.0" - port: 8000 + port: 8020 reload: True timeout_keep_alive: 300 @@ -38,8 +38,8 @@ rate_limiting: # Security Configuration security: - enabled: true - jwt_enabled: true + enabled: false + jwt_enabled: false https_redirect: false trusted_hosts: ["*"] headers: diff --git a/docs/examples/crypto_analysis_example.py b/docs/examples/crypto_analysis_example.py new file mode 100644 index 00000000..4160ba35 --- /dev/null +++ b/docs/examples/crypto_analysis_example.py @@ -0,0 +1,230 @@ +""" +Crawl4AI Crypto Trading Analysis Demo +Author: Unclecode +Date: 2024-03-15 + +This 
script demonstrates advanced crypto market analysis using: +1. Web scraping of real-time CoinMarketCap data +2. Smart table extraction with layout detection +3. Hedge fund-grade financial metrics +4. Interactive visualizations for trading signals + +Key Features: +- Volume Anomaly Detection: Finds unusual trading activity +- Liquidity Power Score: Identifies easily tradable assets +- Volatility-Weighted Momentum: Surface sustainable trends +- Smart Money Signals: Algorithmic buy/hold recommendations +""" + +import asyncio +import pandas as pd +import plotly.express as px +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LXMLWebScrapingStrategy +from crawl4ai import CrawlResult +from typing import List +from IPython.display import HTML + +class CryptoAlphaGenerator: + """ + Advanced crypto analysis engine that transforms raw web data into: + - Volume anomaly flags + - Liquidity scores + - Momentum-risk ratios + - Machine learning-inspired trading signals + + Methods: + analyze_tables(): Process raw tables into trading insights + create_visuals(): Generate institutional-grade visualizations + generate_insights(): Create plain English trading recommendations + """ + + def clean_data(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Convert crypto market data to machine-readable format + Handles currency symbols, units (B=Billions), and percentage values + """ + # Clean numeric columns + df['Price'] = df['Price'].str.replace('[^\d.]', '', regex=True).astype(float) + df['Market Cap'] = df['Market Cap'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9 + df['Volume(24h)'] = df['Volume(24h)'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9 + + # Convert percentages to decimal values + for col in ['1h %', '24h %', '7d %']: + df[col] = df[col].str.replace('%', '').astype(float) / 100 + + return df + + def calculate_metrics(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Compute advanced trading metrics used by quantitative funds: + + 1. 
Volume/Market Cap Ratio - Measures liquidity efficiency + (High ratio = Underestimated attention) + + 2. Volatility Score - Risk-adjusted momentum potential + (STD of 1h/24h/7d returns) + + 3. Momentum Score - Weighted average of returns + (1h:30% + 24h:50% + 7d:20%) + + 4. Volume Anomaly - 3Οƒ deviation detection + (Flags potential insider activity) + """ + # Liquidity Metrics + df['Volume/Market Cap Ratio'] = df['Volume(24h)'] / df['Market Cap'] + + # Risk Metrics + df['Volatility Score'] = df[['1h %','24h %','7d %']].std(axis=1) + + # Momentum Metrics + df['Momentum Score'] = (df['1h %']*0.3 + df['24h %']*0.5 + df['7d %']*0.2) + + # Anomaly Detection + median_vol = df['Volume(24h)'].median() + df['Volume Anomaly'] = df['Volume(24h)'] > 3 * median_vol + + # Value Flags + df['Undervalued Flag'] = (df['Market Cap'] < 1e9) & (df['Momentum Score'] > 0.05) + df['Liquid Giant'] = (df['Volume/Market Cap Ratio'] > 0.15) & (df['Market Cap'] > 1e9) + + return df + + def create_visuals(self, df: pd.DataFrame) -> dict: + """ + Generate three institutional-grade visualizations: + + 1. 3D Market Map - X:Size, Y:Liquidity, Z:Momentum + 2. Liquidity Tree - Color:Volume Efficiency + 3. 
Momentum Leaderboard - Top sustainable movers + """ + # 3D Market Overview + fig1 = px.scatter_3d( + df, + x='Market Cap', + y='Volume/Market Cap Ratio', + z='Momentum Score', + size='Volatility Score', + color='Volume Anomaly', + hover_name='Name', + title='Smart Money Market Map: Spot Overlooked Opportunities', + labels={'Market Cap': 'Size (Log $)', 'Volume/Market Cap Ratio': 'Liquidity Power'}, + log_x=True, + template='plotly_dark' + ) + + # Liquidity Efficiency Tree + fig2 = px.treemap( + df, + path=['Name'], + values='Market Cap', + color='Volume/Market Cap Ratio', + hover_data=['Momentum Score'], + title='Liquidity Forest: Green = High Trading Efficiency', + color_continuous_scale='RdYlGn' + ) + + # Momentum Leaders + fig3 = px.bar( + df.sort_values('Momentum Score', ascending=False).head(10), + x='Name', + y='Momentum Score', + color='Volatility Score', + title='Sustainable Momentum Leaders (Low Volatility + High Growth)', + text='7d %', + template='plotly_dark' + ) + + return {'market_map': fig1, 'liquidity_tree': fig2, 'momentum_leaders': fig3} + + def generate_insights(self, df: pd.DataFrame) -> str: + """ + Create plain English trading insights explaining: + - Volume spikes and their implications + - Risk-reward ratios of top movers + - Liquidity warnings for large positions + """ + top_coin = df.sort_values('Momentum Score', ascending=False).iloc[0] + anomaly_coins = df[df['Volume Anomaly']].sort_values('Volume(24h)', ascending=False) + + report = f""" + πŸš€ Top Alpha Opportunity: {top_coin['Name']} + - Momentum Score: {top_coin['Momentum Score']:.2%} (Top 1%) + - Risk-Reward Ratio: {top_coin['Momentum Score']/top_coin['Volatility Score']:.1f} + - Liquidity Warning: {'βœ… Safe' if top_coin['Liquid Giant'] else '⚠️ Thin Markets'} + + πŸ”₯ Volume Spikes Detected ({len(anomaly_coins)} coins): + {anomaly_coins[['Name', 'Volume(24h)']].head(3).to_markdown(index=False)} + + πŸ’‘ Smart Money Tip: Coins with Volume/Cap > 15% and Momentum > 5% + historically 
outperform by 22% weekly returns. + """ + return report + +async def main(): + """ + Main execution flow: + 1. Configure headless browser for scraping + 2. Extract live crypto market data + 3. Clean and analyze using hedge fund models + 4. Generate visualizations and insights + 5. Output professional trading report + """ + # Configure browser with anti-detection features + browser_config = BrowserConfig( + headless=True, + stealth=True, + block_resources=["image", "media"] + ) + + # Initialize crawler with smart table detection + crawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + + try: + # Set up scraping parameters + crawl_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + scraping_strategy=LXMLWebScrapingStrategy( + table_score_threshold=8, # Strict table detection + keep_data_attributes=True + ) + ) + + # Execute market data extraction + results: List[CrawlResult] = await crawler.arun( + url='https://coinmarketcap.com/?page=1', + config=crawl_config + ) + + # Process results + for result in results: + if result.success and result.media['tables']: + # Extract primary market table + raw_df = pd.DataFrame( + result.media['tables'][0]['rows'], + columns=result.media['tables'][0]['headers'] + ) + + # Initialize analysis engine + analyzer = CryptoAlphaGenerator() + clean_df = analyzer.clean_data(raw_df) + analyzed_df = analyzer.calculate_metrics(clean_df) + + # Generate outputs + visuals = analyzer.create_visuals(analyzed_df) + insights = analyzer.generate_insights(analyzed_df) + + # Save visualizations + visuals['market_map'].write_html("market_map.html") + visuals['liquidity_tree'].write_html("liquidity_tree.html") + + # Display results + print("πŸ”‘ Key Trading Insights:") + print(insights) + print("\nπŸ“Š Open 'market_map.html' for interactive analysis") + + finally: + await crawler.close() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/deepcrawl.py 
b/docs/examples/deepcrawl_example.py similarity index 75% rename from docs/examples/deepcrawl.py rename to docs/examples/deepcrawl_example.py index 6df716af..741c0039 100644 --- a/docs/examples/deepcrawl.py +++ b/docs/examples/deepcrawl_example.py @@ -65,7 +65,6 @@ async def basic_deep_crawl(): f"\nβœ… Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds" ) - # 2️⃣ Stream vs. Non-Stream Execution async def stream_vs_nonstream(): """ @@ -80,7 +79,7 @@ async def stream_vs_nonstream(): base_config = CrawlerRunConfig( deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False), scraping_strategy=LXMLWebScrapingStrategy(), - verbose=True, + verbose=False, ) async with AsyncWebCrawler() as crawler: @@ -127,7 +126,6 @@ async def stream_vs_nonstream(): print(f" βœ… All results: {time.perf_counter() - start_time:.2f} seconds") print("\nπŸ” Key Takeaway: Streaming allows processing results immediately") - # 3️⃣ Introduce Filters & Scorers async def filters_and_scorers(): """ @@ -212,11 +210,11 @@ async def filters_and_scorers(): # Create a keyword relevance scorer keyword_scorer = KeywordRelevanceScorer( - keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3 + keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1 ) config = CrawlerRunConfig( - deep_crawl_strategy=BestFirstCrawlingStrategy( # Note: Changed to BestFirst + deep_crawl_strategy=BestFirstCrawlingStrategy( max_depth=1, include_external=False, url_scorer=keyword_scorer ), scraping_strategy=LXMLWebScrapingStrategy(), @@ -236,11 +234,172 @@ async def filters_and_scorers(): print(f" βœ… Crawler prioritized {len(results)} pages by relevance score") print(" πŸ” Note: BestFirstCrawlingStrategy visits highest-scoring pages first") +# 4️⃣ Advanced Filters +async def advanced_filters(): + """ + PART 4: Demonstrates advanced filtering techniques for specialized crawling. 
-# 4️⃣ Wrap-Up and Key Takeaways + This function covers: + - SEO filters + - Text relevancy filtering + - Combining advanced filters + """ + print("\n===== ADVANCED FILTERS =====") + + async with AsyncWebCrawler() as crawler: + # SEO FILTER EXAMPLE + print("\nπŸ“Š EXAMPLE 1: SEO FILTERS") + print( + "Quantitative SEO quality assessment filter based searching keywords in the head section" + ) + + seo_filter = SEOFilter( + threshold=0.5, keywords=["dynamic", "interaction", "javascript"] + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=1, filter_chain=FilterChain([seo_filter]) + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + ) + + results = await crawler.arun(url="https://docs.crawl4ai.com", config=config) + + print(f" βœ… Found {len(results)} pages with relevant keywords") + for result in results: + print(f" β†’ {result.url}") + + # ADVANCED TEXT RELEVANCY FILTER + print("\nπŸ“Š EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER") + + # More sophisticated content relevance filter + relevance_filter = ContentRelevanceFilter( + query="Interact with the web using your authentic digital identity", + threshold=0.7, + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=1, filter_chain=FilterChain([relevance_filter]) + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + ) + + results = await crawler.arun(url="https://docs.crawl4ai.com", config=config) + + print(f" βœ… Found {len(results)} pages") + for result in results: + relevance_score = result.metadata.get("relevance_score", 0) + print(f" β†’ Score: {relevance_score:.2f} | {result.url}") + +# 5️⃣ Max Pages and Score Thresholds +async def max_pages_and_thresholds(): + """ + PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies. 
+ + This function shows: + - How to limit the number of pages crawled + - How to set score thresholds for more targeted crawling + - Comparing BFS, DFS, and Best-First strategies with these parameters + """ + print("\n===== MAX PAGES AND SCORE THRESHOLDS =====") + + from crawl4ai.deep_crawling import DFSDeepCrawlStrategy + + async with AsyncWebCrawler() as crawler: + # Define a common keyword scorer for all examples + keyword_scorer = KeywordRelevanceScorer( + keywords=["browser", "crawler", "web", "automation"], + weight=1.0 + ) + + # EXAMPLE 1: BFS WITH MAX PAGES + print("\nπŸ“Š EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT") + print(" Limit the crawler to a maximum of 5 pages") + + bfs_config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer, + max_pages=5 # Only crawl 5 pages + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + ) + + results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config) + + print(f" βœ… Crawled exactly {len(results)} pages as specified by max_pages") + for result in results: + depth = result.metadata.get("depth", 0) + print(f" β†’ Depth: {depth} | {result.url}") + + # EXAMPLE 2: DFS WITH SCORE THRESHOLD + print("\nπŸ“Š EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD") + print(" Only crawl pages with a relevance score above 0.5") + + dfs_config = CrawlerRunConfig( + deep_crawl_strategy=DFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer, + score_threshold=0.7, # Only process URLs with scores above 0.5 + max_pages=10 + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + ) + + results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config) + + print(f" βœ… Crawled {len(results)} pages with scores above threshold") + for result in results: + score = result.metadata.get("score", 0) + depth = 
result.metadata.get("depth", 0) + print(f" β†’ Depth: {depth} | Score: {score:.2f} | {result.url}") + + # EXAMPLE 3: BEST-FIRST WITH BOTH CONSTRAINTS + print("\nπŸ“Š EXAMPLE 3: BEST-FIRST STRATEGY WITH BOTH CONSTRAINTS") + print(" Limit to 7 pages with scores above 0.3, prioritizing highest scores") + + bf_config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer, + max_pages=7, # Limit to 7 pages total + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True, + cache_mode=CacheMode.BYPASS, + stream=True, + ) + + results = [] + async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config): + results.append(result) + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f" β†’ Depth: {depth} | Score: {score:.2f} | {result.url}") + + print(f" βœ… Crawled {len(results)} high-value pages with scores above 0.3") + if results: + avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results) + print(f" βœ… Average score: {avg_score:.2f}") + print(" πŸ” Note: BestFirstCrawlingStrategy visited highest-scoring pages first") + +# 6️⃣ Wrap-Up and Key Takeaways async def wrap_up(): """ - PART 4: Wrap-Up and Key Takeaways + PART 6: Wrap-Up and Key Takeaways Summarize the key concepts learned in this tutorial. """ @@ -308,71 +467,6 @@ async def wrap_up(): print(f" Depth {depth}: {count} pages") -# 5️⃣ Advanced Filters -async def advanced_filters(): - """ - PART 5: Demonstrates advanced filtering techniques for specialized crawling. 
- - This function covers: - - SEO filters - - Text relevancy filtering - - Combining advanced filters - """ - print("\n===== ADVANCED FILTERS =====") - - async with AsyncWebCrawler() as crawler: - # SEO FILTER EXAMPLE - print("\nπŸ“Š EXAMPLE 1: SEO FILTERS") - print( - "Quantitative SEO quality assessment filter based searching keywords in the head section" - ) - - seo_filter = SEOFilter( - threshold=0.5, keywords=["dynamic", "interaction", "javascript"] - ) - - config = CrawlerRunConfig( - deep_crawl_strategy=BFSDeepCrawlStrategy( - max_depth=1, filter_chain=FilterChain([seo_filter]) - ), - scraping_strategy=LXMLWebScrapingStrategy(), - verbose=True, - cache_mode=CacheMode.BYPASS, - ) - - results = await crawler.arun(url="https://docs.crawl4ai.com", config=config) - - print(f" βœ… Found {len(results)} pages with relevant keywords") - for result in results: - print(f" β†’ {result.url}") - - # ADVANCED TEXT RELEVANCY FILTER - print("\nπŸ“Š EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER") - - # More sophisticated content relevance filter - relevance_filter = ContentRelevanceFilter( - query="Interact with the web using your authentic digital identity", - threshold=0.7, - ) - - config = CrawlerRunConfig( - deep_crawl_strategy=BFSDeepCrawlStrategy( - max_depth=1, filter_chain=FilterChain([relevance_filter]) - ), - scraping_strategy=LXMLWebScrapingStrategy(), - verbose=True, - cache_mode=CacheMode.BYPASS, - ) - - results = await crawler.arun(url="https://docs.crawl4ai.com", config=config) - - print(f" βœ… Found {len(results)} pages") - for result in results: - relevance_score = result.metadata.get("relevance_score", 0) - print(f" β†’ Score: {relevance_score:.2f} | {result.url}") - - -# Main function to run the entire tutorial async def run_tutorial(): """ Executes all tutorial sections in sequence. 
@@ -387,8 +481,9 @@ async def run_tutorial(): basic_deep_crawl, stream_vs_nonstream, filters_and_scorers, - wrap_up, + max_pages_and_thresholds, advanced_filters, + wrap_up, ] for section in tutorial_sections: @@ -398,7 +493,6 @@ async def run_tutorial(): print("You now have a comprehensive understanding of deep crawling with Crawl4AI.") print("For more information, check out https://docs.crawl4ai.com") - # Execute the tutorial when run directly if __name__ == "__main__": asyncio.run(run_tutorial()) \ No newline at end of file diff --git a/docs/examples/dispatcher_example.py b/docs/examples/dispatcher_example.py index cac08186..8ac24d3b 100644 --- a/docs/examples/dispatcher_example.py +++ b/docs/examples/dispatcher_example.py @@ -39,7 +39,7 @@ async def memory_adaptive_with_rate_limit(urls, browser_config, run_config): start = time.perf_counter() async with AsyncWebCrawler(config=browser_config) as crawler: dispatcher = MemoryAdaptiveDispatcher( - memory_threshold_percent=70.0, + memory_threshold_percent=95.0, max_session_permit=10, rate_limiter=RateLimiter( base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2 diff --git a/docs/examples/extraction_strategies_examples.py b/docs/examples/extraction_strategies_examples.py index 56ab124f..84192f97 100644 --- a/docs/examples/extraction_strategies_examples.py +++ b/docs/examples/extraction_strategies_examples.py @@ -11,7 +11,7 @@ import asyncio import os from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, JsonCssExtractionStrategy, @@ -61,19 +61,19 @@ async def main(): # 1. 
LLM Extraction with different input formats markdown_strategy = LLMExtractionStrategy( - llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), + llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract product information including name, price, and description", ) html_strategy = LLMExtractionStrategy( input_format="html", - llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), + llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract product information from HTML including structured data", ) fit_markdown_strategy = LLMExtractionStrategy( input_format="fit_markdown", - llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")), + llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract product information from cleaned markdown", ) diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index 021b24b6..c44908d5 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -16,9 +16,9 @@ async def main(): crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ) + # content_filter=PruningContentFilter( + # threshold=0.48, threshold_type="fixed", min_word_threshold=0 + # ) ), ) result : CrawlResult = await crawler.arun( diff --git a/docs/examples/identity_based_browsing.py b/docs/examples/identity_based_browsing.py new file mode 100644 index 00000000..01596948 --- /dev/null +++ b/docs/examples/identity_based_browsing.py @@ -0,0 +1,108 @@ +""" +Identity-Based Browsing Example with Crawl4AI + +This example demonstrates how to: +1. Create a persistent browser profile interactively +2. 
List available profiles +3. Use a saved profile for crawling authenticated sites +4. Delete profiles when no longer needed + +Uses the new BrowserProfiler class for profile management. +""" + +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig +from crawl4ai.browser_profiler import BrowserProfiler +from crawl4ai.async_logger import AsyncLogger +from colorama import Fore, Style, init + +# Initialize colorama +init() + +# Create a shared logger instance +logger = AsyncLogger(verbose=True) + +# Create a shared BrowserProfiler instance +profiler = BrowserProfiler(logger=logger) + + +async def crawl_with_profile(profile_path, url): + """Use a profile to crawl an authenticated page""" + logger.info(f"\nCrawling {Fore.CYAN}{url}{Style.RESET_ALL} using profile at {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CRAWL") + + # Create browser config with the profile path + browser_config = BrowserConfig( + headless=False, # Set to False if you want to see the browser window + use_managed_browser=True, # Required for persistent profiles + user_data_dir=profile_path + ) + + start_time = asyncio.get_event_loop().time() + + # Initialize crawler with the browser config + async with AsyncWebCrawler(config=browser_config) as crawler: + # Crawl the URL - You should have access to authenticated content now + result = await crawler.arun(url) + + elapsed_time = asyncio.get_event_loop().time() - start_time + + if result.success: + # Use url_status method for consistent logging + logger.url_status(url, True, elapsed_time, tag="CRAWL") + + # Print page title or some indication of success + title = result.metadata.get("title", "") + logger.success(f"Page title: {Fore.GREEN}{title}{Style.RESET_ALL}", tag="CRAWL") + return result + else: + # Log error status + logger.error_status(url, result.error_message, tag="CRAWL") + return None + + +async def main(): + logger.info(f"{Fore.CYAN}Identity-Based Browsing Example with Crawl4AI{Style.RESET_ALL}", tag="DEMO") + 
logger.info("This example demonstrates using profiles for authenticated browsing", tag="DEMO") + + # Choose between interactive mode and automatic mode + mode = input(f"{Fore.CYAN}Run in [i]nteractive mode or [a]utomatic mode? (i/a): {Style.RESET_ALL}").lower() + + if mode == 'i': + # Interactive profile management - use the interactive_manager method + # Pass the crawl_with_profile function as the callback for the "crawl a website" option + await profiler.interactive_manager(crawl_callback=crawl_with_profile) + else: + # Automatic mode - simplified example + profiles = profiler.list_profiles() + + if not profiles: + # Create a new profile if none exists + logger.info("No profiles found. Creating a new one...", tag="DEMO") + profile_path = await profiler.create_profile() + if not profile_path: + logger.error("Cannot proceed without a valid profile", tag="DEMO") + return + else: + # Use the first (most recent) profile + profile_path = profiles[0]["path"] + logger.info(f"Using existing profile: {Fore.CYAN}{profiles[0]['name']}{Style.RESET_ALL}", tag="DEMO") + + # Example: Crawl an authenticated page + urls_to_crawl = [ + "https://github.com/settings/profile", # GitHub requires login + # "https://twitter.com/home", # Twitter requires login + # "https://www.linkedin.com/feed/", # LinkedIn requires login + ] + + for url in urls_to_crawl: + await crawl_with_profile(profile_path, url) + + +if __name__ == "__main__": + try: + # Run the async main function + asyncio.run(main()) + except KeyboardInterrupt: + logger.warning("Example interrupted by user", tag="DEMO") + except Exception as e: + logger.error(f"Error in example: {str(e)}", tag="DEMO") \ No newline at end of file diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py index 72742bd5..27a1c310 100644 --- a/docs/examples/llm_extraction_openai_pricing.py +++ b/docs/examples/llm_extraction_openai_pricing.py @@ -1,10 +1,11 @@ -from crawl4ai.async_configs import 
LlmConfig -from crawl4ai.extraction_strategy import * -from crawl4ai.crawler_strategy import * +from crawl4ai import LLMConfig +from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy import asyncio +import os +import json from pydantic import BaseModel, Field -url = r"https://openai.com/api/pricing/" +url = "https://openai.com/api/pricing/" class OpenAIModelFee(BaseModel): @@ -14,10 +15,6 @@ class OpenAIModelFee(BaseModel): ..., description="Fee for output token for the OpenAI model." ) - -from crawl4ai import AsyncWebCrawler - - async def main(): # Use AsyncWebCrawler async with AsyncWebCrawler() as crawler: @@ -26,7 +23,7 @@ async def main(): word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), - llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")), + llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")), schema=OpenAIModelFee.model_json_schema(), extraction_type="schema", instruction="From the crawled content, extract all mentioned model names along with their " diff --git a/docs/examples/llm_markdown_generator.py b/docs/examples/llm_markdown_generator.py index f3e18df4..777c59b0 100644 --- a/docs/examples/llm_markdown_generator.py +++ b/docs/examples/llm_markdown_generator.py @@ -1,7 +1,7 @@ import os import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig from crawl4ai.content_filter_strategy import LLMContentFilter async def test_llm_filter(): @@ -23,7 +23,7 @@ async def test_llm_filter(): # Initialize LLM filter with focused instruction filter = LLMContentFilter( - llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')), + llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')), instruction=""" Focus on 
extracting the core educational content about Python classes. Include: @@ -43,7 +43,7 @@ async def test_llm_filter(): ) filter = LLMContentFilter( - llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), + llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), chunk_token_threshold=2 ** 12 * 2, # 2048 * 2 ignore_cache = True, instruction=""" diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py index 450880a9..5efb785d 100644 --- a/docs/examples/quickstart_async.config.py +++ b/docs/examples/quickstart_async.config.py @@ -1,6 +1,6 @@ import os, sys -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig sys.path.append( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -211,7 +211,7 @@ async def extract_structured_data_using_llm( word_count_threshold=1, page_timeout=80000, extraction_strategy=LLMExtractionStrategy( - llmConfig=LlmConfig(provider=provider,api_token=api_token), + llm_config=LLMConfig(provider=provider,api_token=api_token), schema=OpenAIModelFee.model_json_schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. @@ -416,6 +416,7 @@ async def crawl_dynamic_content_pages_method_2(): async def cosine_similarity_extraction(): + from crawl4ai.extraction_strategy import CosineStrategy crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=CosineStrategy( @@ -507,6 +508,9 @@ async def ssl_certification(): if result.success and result.ssl_certificate: cert = result.ssl_certificate + tmp_dir = os.path.join(__location__, "tmp") + os.makedirs(tmp_dir, exist_ok=True) + # 1. 
Access certificate properties directly print("\nCertificate Information:") print(f"Issuer: {cert.issuer.get('CN', '')}") @@ -529,67 +533,6 @@ async def ssl_certification(): print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") -# Speed Comparison -async def speed_comparison(): - print("\n--- Speed Comparison ---") - - # Firecrawl comparison - from firecrawl import FirecrawlApp - - app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"]) - start = time.time() - scrape_status = app.scrape_url( - "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]} - ) - end = time.time() - print("Firecrawl:") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(scrape_status['markdown'])} characters") - print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}") - print() - - # Crawl4AI comparisons - browser_config = BrowserConfig(headless=True) - - # Simple crawl - async with AsyncWebCrawler(config=browser_config) as crawler: - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, word_count_threshold=0 - ), - ) - end = time.time() - print("Crawl4AI (simple crawl):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown)} characters") - print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") - print() - - # Advanced filtering - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - word_count_threshold=0, - markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ) - ), - ), - ) - end = time.time() - print("Crawl4AI (Markdown Plus):") - print(f"Time taken: {end - start:.2f} seconds") - print(f"Content length: {len(result.markdown.raw_markdown)} characters") - 
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters") - print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}") - print() - - # Main execution async def main(): # Basic examples diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 0aa930ea..aeb0d20a 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -1,6 +1,6 @@ import os, sys -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig # append parent directory to system path sys.path.append( @@ -147,7 +147,7 @@ async def extract_structured_data_using_llm( url="https://openai.com/api/pricing/", word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( - llmConfig=LlmConfig(provider=provider,api_token=api_token), + llm_config=LLMConfig(provider=provider,api_token=api_token), schema=OpenAIModelFee.model_json_schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
@@ -570,7 +570,7 @@ async def generate_knowledge_graph(): relationships: List[Relationship] extraction_strategy = LLMExtractionStrategy( - llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token" + llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token" schema=KnowledgeGraph.model_json_schema(), extraction_type="schema", instruction="""Extract entities and relationships from the given text.""", diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py index 3be1baf0..78f3e56c 100644 --- a/docs/examples/quickstart_sync.py +++ b/docs/examples/quickstart_sync.py @@ -1,6 +1,6 @@ import os import time -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig from crawl4ai.web_crawler import WebCrawler from crawl4ai.chunking_strategy import * from crawl4ai.extraction_strategy import * @@ -179,7 +179,7 @@ def add_llm_extraction_strategy(crawler): result = crawler.run( url="https://www.nbcnews.com/business", extraction_strategy=LLMExtractionStrategy( - llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")) + llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")) ), ) cprint( @@ -198,7 +198,7 @@ def add_llm_extraction_strategy(crawler): result = crawler.run( url="https://www.nbcnews.com/business", extraction_strategy=LLMExtractionStrategy( - llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), + llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), instruction="I am interested in only financial news", ), ) @@ -210,7 +210,7 @@ def add_llm_extraction_strategy(crawler): result = crawler.run( url="https://www.nbcnews.com/business", extraction_strategy=LLMExtractionStrategy( - llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), 
+ llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract only content related to technology", ), ) diff --git a/docs/examples/tutorial_v0.5.py b/docs/examples/tutorial_v0.5.py index 5b2df61e..3cbbdb7b 100644 --- a/docs/examples/tutorial_v0.5.py +++ b/docs/examples/tutorial_v0.5.py @@ -13,11 +13,11 @@ from crawl4ai.deep_crawling import ( ) from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy -from crawl4ai.configs import ProxyConfig +from crawl4ai.proxy_strategy import ProxyConfig from crawl4ai import RoundRobinProxyStrategy from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai import DefaultMarkdownGenerator -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy from pprint import pprint @@ -284,9 +284,9 @@ async def llm_content_filter(): PART 5: LLM Content Filter This function demonstrates: - - Configuring LLM providers via LlmConfig + - Configuring LLM providers via LLMConfig - Using LLM to generate focused markdown - - LlmConfig for configuration + - LLMConfig for configuration Note: Requires a valid API key for the chosen LLM provider """ @@ -296,7 +296,7 @@ async def llm_content_filter(): # Create LLM configuration # Replace with your actual API key or set as environment variable - llm_config = LlmConfig( + llm_config = LLMConfig( provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable ) @@ -309,7 +309,7 @@ async def llm_content_filter(): # Create markdown generator with LLM filter markdown_generator = DefaultMarkdownGenerator( content_filter=LLMContentFilter( - llmConfig=llm_config, + llm_config=llm_config, instruction="Extract key concepts and summaries" ) ) @@ -381,7 
+381,7 @@ async def llm_schema_generation(): PART 7: LLM Schema Generation This function demonstrates: - - Configuring LLM providers via LlmConfig + - Configuring LLM providers via LLMConfig - Using LLM to generate extraction schemas - JsonCssExtractionStrategy @@ -406,9 +406,9 @@ async def llm_schema_generation():
4.7/5
""" - print("\nπŸ“Š Setting up LlmConfig...") + print("\nπŸ“Š Setting up LLMConfig...") # Create LLM configuration - llm_config = LlmConfig( + llm_config = LLMConfig( provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY" ) @@ -416,7 +416,7 @@ async def llm_schema_generation(): print(" This would use the LLM to analyze HTML and create an extraction schema") schema = JsonCssExtractionStrategy.generate_schema( html=sample_html, - llmConfig = llm_config, + llm_config = llm_config, query="Extract product name and price" ) print("\nβœ… Generated Schema:") diff --git a/docs/md_v2/advanced/identity-based-crawling.md b/docs/md_v2/advanced/identity-based-crawling.md index 702d9475..403acb9a 100644 --- a/docs/md_v2/advanced/identity-based-crawling.md +++ b/docs/md_v2/advanced/identity-based-crawling.md @@ -167,13 +167,114 @@ async with AsyncWebCrawler() as crawler: --- -## 6. Summary +## 6. Using the BrowserProfiler Class -- **Create** your user-data directory by launching Chrome/Chromium externally with `--user-data-dir=/some/path`. -- **Log in** or configure sites as needed, then close the browser. -- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`. -- Enjoy **persistent** sessions that reflect your real identity. -- If you only need quick, ephemeral automation, **Magic Mode** might suffice. +Crawl4AI provides a dedicated `BrowserProfiler` class for managing browser profiles, making it easy to create, list, and delete profiles for identity-based browsing. 
+ +### Creating and Managing Profiles with BrowserProfiler + +The `BrowserProfiler` class offers a comprehensive API for browser profile management: + +```python +import asyncio +from crawl4ai import BrowserProfiler + +async def manage_profiles(): + # Create a profiler instance + profiler = BrowserProfiler() + + # Create a profile interactively - opens a browser window + profile_path = await profiler.create_profile( + profile_name="my-login-profile" # Optional: name your profile + ) + + print(f"Profile saved at: {profile_path}") + + # List all available profiles + profiles = profiler.list_profiles() + + for profile in profiles: + print(f"Profile: {profile['name']}") + print(f" Path: {profile['path']}") + print(f" Created: {profile['created']}") + print(f" Browser type: {profile['type']}") + + # Get a specific profile path by name + specific_profile = profiler.get_profile_path("my-login-profile") + + # Delete a profile when no longer needed + success = profiler.delete_profile("old-profile-name") + +asyncio.run(manage_profiles()) +``` + +**How profile creation works:** +1. A browser window opens for you to interact with +2. You log in to websites, set preferences, etc. +3. When you're done, press 'q' in the terminal to close the browser +4. The profile is saved in the Crawl4AI profiles directory +5. 
You can use the returned path with `BrowserConfig.user_data_dir` + +### Interactive Profile Management + +The `BrowserProfiler` also offers an interactive management console that guides you through profile creation, listing, and deletion: + +```python +import asyncio +from crawl4ai import BrowserProfiler, AsyncWebCrawler, BrowserConfig + +# Define a function to use a profile for crawling +async def crawl_with_profile(profile_path, url): + browser_config = BrowserConfig( + headless=True, + use_managed_browser=True, + user_data_dir=profile_path + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url) + return result + +async def main(): + # Create a profiler instance + profiler = BrowserProfiler() + + # Launch the interactive profile manager + # Passing the crawl function as a callback adds a "crawl with profile" option + await profiler.interactive_manager(crawl_callback=crawl_with_profile) + +asyncio.run(main()) +``` + +### Legacy Methods + +For backward compatibility, the previous methods on `ManagedBrowser` are still available, but they delegate to the new `BrowserProfiler` class: + +```python +from crawl4ai.browser_manager import ManagedBrowser + +# These methods still work but use BrowserProfiler internally +profiles = ManagedBrowser.list_profiles() +``` + +### Complete Example + +See the full example in `docs/examples/identity_based_browsing.py` for a complete demonstration of creating and using profiles for authenticated browsing using the new `BrowserProfiler` class. + +--- + +## 7. 
Summary + +- **Create** your user-data directory either: + - By launching Chrome/Chromium externally with `--user-data-dir=/some/path` + - Or by using the built-in `BrowserProfiler.create_profile()` method + - Or through the interactive interface with `profiler.interactive_manager()` +- **Log in** or configure sites as needed, then close the browser +- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True` +- **List and reuse** profiles with `BrowserProfiler.list_profiles()` +- **Manage** your profiles with the dedicated `BrowserProfiler` class +- Enjoy **persistent** sessions that reflect your real identity +- If you only need quick, ephemeral automation, **Magic Mode** might suffice **Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary. diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index b8a1a213..b3e4349b 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -71,7 +71,8 @@ We group them by category. | **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. | | **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). | | **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). | -| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. | +| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. 
| +| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. | | **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). | | **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. | | **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. | @@ -246,8 +247,8 @@ run_config = CrawlerRunConfig( ) ``` -# 3. **LlmConfig** - Setting up LLM providers -LlmConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following - +# 3. **LLMConfig** - Setting up LLM providers +LLMConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following - 1. LLMExtractionStrategy 2. LLMContentFilter @@ -263,7 +264,7 @@ LlmConfig is useful to pass LLM provider config to strategies and functions that ## 3.2 Example Usage ```python -llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) +llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) ``` ## 4. Putting It All Together @@ -271,7 +272,7 @@ llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI - **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent. - **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS. - **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`). 
-- **Use** `LlmConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema` +- **Use** `LLMConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema` ```python # Create a modified copy with the clone() method diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md index fbf7a6ee..45d44950 100644 --- a/docs/md_v2/api/strategies.md +++ b/docs/md_v2/api/strategies.md @@ -131,7 +131,7 @@ OverlappingWindowChunking( ```python from pydantic import BaseModel from crawl4ai.extraction_strategy import LLMExtractionStrategy -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig # Define schema class Article(BaseModel): @@ -141,7 +141,7 @@ class Article(BaseModel): # Create strategy strategy = LLMExtractionStrategy( - llmConfig = LlmConfig(provider="ollama/llama2"), + llm_config = LLMConfig(provider="ollama/llama2"), schema=Article.schema(), instruction="Extract article details" ) @@ -198,7 +198,7 @@ result = await crawler.arun( ```python from crawl4ai.chunking_strategy import OverlappingWindowChunking -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig # Create chunking strategy chunker = OverlappingWindowChunking( @@ -208,7 +208,7 @@ chunker = OverlappingWindowChunking( # Use with extraction strategy strategy = LLMExtractionStrategy( - llmConfig = LlmConfig(provider="ollama/llama2"), + llm_config = LLMConfig(provider="ollama/llama2"), chunking_strategy=chunker ) diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md index ad95c662..1eed43d9 100644 --- a/docs/md_v2/blog/index.md +++ 
b/docs/md_v2/blog/index.md @@ -16,7 +16,7 @@ My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5 * **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks. * **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication. * **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands. -* **LLM Configuration (`LlmConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models. +* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models. **Minor Updates & Improvements:** @@ -47,7 +47,7 @@ This release includes several breaking changes to improve the library's structur * **Config**: FastFilterChain has been replaced with FilterChain * **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] * **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations -* **LLM Parameters:** Use the new `LlmConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`. +* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`. **In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide. 
diff --git a/docs/md_v2/blog/releases/0.5.0.md b/docs/md_v2/blog/releases/0.5.0.md index 8b349dfe..24b0feda 100644 --- a/docs/md_v2/blog/releases/0.5.0.md +++ b/docs/md_v2/blog/releases/0.5.0.md @@ -251,7 +251,7 @@ from crawl4ai import ( RoundRobinProxyStrategy, ) import asyncio -from crawl4ai.configs import ProxyConfig +from crawl4ai.proxy_strategy import ProxyConfig async def main(): # Load proxies and create rotation strategy proxies = ProxyConfig.from_env() @@ -305,13 +305,13 @@ asyncio.run(main()) ```python from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import LLMContentFilter -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig import asyncio -llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY") +llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY") markdown_generator = DefaultMarkdownGenerator( - content_filter=LLMContentFilter(llmConfig=llm_config, instruction="Extract key concepts and summaries") + content_filter=LLMContentFilter(llm_config=llm_config, instruction="Extract key concepts and summaries") ) config = CrawlerRunConfig(markdown_generator=markdown_generator) @@ -335,13 +335,13 @@ asyncio.run(main()) ```python from crawl4ai.extraction_strategy import JsonCssExtractionStrategy -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig -llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY") +llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY") schema = JsonCssExtractionStrategy.generate_schema( html="
<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>
", - llmConfig = llm_config, + llm_config = llm_config, query="Extract product name and price" ) print(schema) @@ -394,20 +394,20 @@ print(schema) serialization, especially for sets of allowed/blocked domains. No code changes required. -- **Added: New `LlmConfig` parameter.** This new parameter can be passed for +- **Added: New `LLMConfig` parameter.** This new parameter can be passed for extraction, filtering, and schema generation tasks. It simplifies passing provider strings, API tokens, and base URLs across all sections where LLM configuration is necessary. It also enables reuse and allows for quick experimentation between different LLM configurations. ```python - from crawl4ai.async_configs import LlmConfig + from crawl4ai import LLMConfig from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import AsyncWebCrawler, CrawlerRunConfig - # Example of using LlmConfig with LLMExtractionStrategy - llm_config = LlmConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY") - strategy = LLMExtractionStrategy(llmConfig=llm_config, schema=...) + # Example of using LLMConfig with LLMExtractionStrategy + llm_config = LLMConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY") + strategy = LLMExtractionStrategy(llm_config=llm_config, schema=...) # Example usage within a crawler async with AsyncWebCrawler() as crawler: @@ -418,7 +418,7 @@ print(schema) ``` **Breaking Change:** Removed old parameters like `provider`, `api_token`, `base_url`, and `api_base` from `LLMExtractionStrategy` and - `LLMContentFilter`. Users should migrate to using the `LlmConfig` object. + `LLMContentFilter`. Users should migrate to using the `LLMConfig` object. - **Changed: Improved browser context management and added shared data support. (Breaking Change:** `BrowserContext` API updated). 
Browser contexts are now diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 33ef81ca..0d97e0fc 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -4,7 +4,7 @@ Crawl4AI’s flexibility stems from two key classes: 1.β€€**`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent). 2.β€€**`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.). -3. **`LlmConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.) +3. **`LLMConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.) In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md). @@ -239,7 +239,7 @@ The `clone()` method: -## 3. LlmConfig Essentials +## 3. LLMConfig Essentials ### Key fields to note @@ -256,16 +256,16 @@ The `clone()` method: - If your provider has a custom endpoint ```python -llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) +llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) ``` ## 4. 
Putting It All Together -In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LlmConfig` depending on each call’s needs: +In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call’s needs: ```python import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig from crawl4ai.extraction_strategy import JsonCssExtractionStrategy async def main(): @@ -289,14 +289,14 @@ async def main(): # 3) Example LLM content filtering - gemini_config = LlmConfig( + gemini_config = LLMConfig( provider="gemini/gemini-1.5-pro" api_token = "env:GEMINI_API_TOKEN" ) # Initialize LLM filter with specific instruction filter = LLMContentFilter( - llmConfig=gemini_config, # or your preferred provider + llm_config=gemini_config, # or your preferred provider instruction=""" Focus on extracting the core educational content. Include: @@ -343,7 +343,7 @@ if __name__ == "__main__": For a **detailed list** of available parameters (including advanced ones), see: -- [BrowserConfig, CrawlerRunConfig & LlmConfig Reference](../api/parameters.md) +- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md) You can explore topics like: @@ -356,7 +356,7 @@ You can explore topics like: ## 6. Conclusion -**BrowserConfig**, **CrawlerRunConfig** and **LlmConfig** give you straightforward ways to define: +**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define: - **Which** browser to launch, how it should run, and any proxy or user agent needs. - **How** each crawl should behaveβ€”caching, timeouts, JavaScript code, extraction strategies, etc. 
diff --git a/docs/md_v2/core/content-selection.md b/docs/md_v2/core/content-selection.md index 9f145852..07c8861b 100644 --- a/docs/md_v2/core/content-selection.md +++ b/docs/md_v2/core/content-selection.md @@ -8,6 +8,10 @@ Below, we show how to configure these parameters and combine them for precise co ## 1. CSS-Based Selection +There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`. + +### 1.1 Using `css_selector` + A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**: ```python @@ -32,6 +36,33 @@ if __name__ == "__main__": **Result**: Only elements matching that selector remain in `result.cleaned_html`. +### 1.2 Using `target_elements` + +The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + config = CrawlerRunConfig( + # Target article body and sidebar, but not other content + target_elements=["article.main-content", "aside.sidebar"] + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/blog-post", + config=config + ) + print("Markdown focused on target elements") + print("Links from entire page still available:", len(result.links.get("internal", []))) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection. + --- ## 2. 
Content Filtering & Exclusions @@ -211,7 +242,7 @@ if __name__ == "__main__": import asyncio import json from pydantic import BaseModel, Field -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai.extraction_strategy import LLMExtractionStrategy class ArticleData(BaseModel): @@ -220,7 +251,7 @@ class ArticleData(BaseModel): async def main(): llm_strategy = LLMExtractionStrategy( - llmConfig = LlmConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY") + llm_config = LLMConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY"), schema=ArticleData.schema(), extraction_type="schema", instruction="Extract 'headline' and a short 'summary' from the content." @@ -404,15 +435,59 @@ Stick to BeautifulSoup strategy (default) when: --- -## 7. Conclusion +## 7. Combining CSS Selection Methods -By mixing **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include: +You can combine `css_selector` and `target_elements` in powerful ways to achieve fine-grained control over your output: -1.β€€**`css_selector`** – Basic scoping to an element or region. -2.β€€**`word_count_threshold`** – Skip short blocks. -3.β€€**`excluded_tags`** – Remove entire HTML tags. -4.β€€**`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains. -5.β€€**`exclude_external_images`** – Remove images from external sources. -6.β€€**`process_iframes`** – Merge iframe content if needed. 
+```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def main(): + # Target specific content but preserve page context + config = CrawlerRunConfig( + # Focus markdown on main content and sidebar + target_elements=["#main-content", ".sidebar"], + + # Global filters applied to entire page + excluded_tags=["nav", "footer", "header"], + exclude_external_links=True, + + # Use basic content thresholds + word_count_threshold=15, + + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/article", + config=config + ) + + print(f"Content focuses on specific elements, but all links still analyzed") + print(f"Internal links: {len(result.links.get('internal', []))}") + print(f"External links: {len(result.links.get('external', []))}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +This approach gives you the best of both worlds: +- Markdown generation and content extraction focus on the elements you care about +- Links, images and other page data still give you the full context of the page +- Content filtering still applies globally + +## 8. Conclusion + +By mixing **target_elements** or **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include: + +1. **`target_elements`** – Array of CSS selectors to focus markdown generation and data extraction, while preserving full page context for links and media. +2. **`css_selector`** – Basic scoping to an element or region for all extraction processes. +3. **`word_count_threshold`** – Skip short blocks. +4. **`excluded_tags`** – Remove entire HTML tags. +5. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains. +6. 
**`exclude_external_images`** – Remove images from external sources. +7. **`process_iframes`** – Merge iframe content if needed. Combine these with structured extraction (CSS, LLM-based, or others) to build powerful crawls that yield exactly the content you want, from raw or cleaned HTML up to sophisticated JSON structures. For more detail, see [Configuration Reference](../api/parameters.md). Enjoy curating your data to the max! \ No newline at end of file diff --git a/docs/md_v2/core/deep-crawling.md b/docs/md_v2/core/deep-crawling.md index 9766208a..00834787 100644 --- a/docs/md_v2/core/deep-crawling.md +++ b/docs/md_v2/core/deep-crawling.md @@ -73,12 +73,18 @@ from crawl4ai.deep_crawling import BFSDeepCrawlStrategy strategy = BFSDeepCrawlStrategy( max_depth=2, # Crawl initial page + 2 levels deep include_external=False, # Stay within the same domain + max_pages=50, # Maximum number of pages to crawl (optional) + score_threshold=0.3, # Minimum score for URLs to be crawled (optional) ) ``` **Key parameters:** - **`max_depth`**: Number of levels to crawl beyond the starting page - **`include_external`**: Whether to follow links to other domains +- **`max_pages`**: Maximum number of pages to crawl (default: infinite) +- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf) +- **`filter_chain`**: FilterChain instance for URL filtering +- **`url_scorer`**: Scorer instance for evaluating URLs ### 2.2 DFSDeepCrawlStrategy (Depth-First Search) @@ -91,12 +97,18 @@ from crawl4ai.deep_crawling import DFSDeepCrawlStrategy strategy = DFSDeepCrawlStrategy( max_depth=2, # Crawl initial page + 2 levels deep include_external=False, # Stay within the same domain + max_pages=30, # Maximum number of pages to crawl (optional) + score_threshold=0.5, # Minimum score for URLs to be crawled (optional) ) ``` **Key parameters:** - **`max_depth`**: Number of levels to crawl beyond the starting page - **`include_external`**: Whether to follow links to other domains +- 
**`max_pages`**: Maximum number of pages to crawl (default: infinite) +- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf) +- **`filter_chain`**: FilterChain instance for URL filtering +- **`url_scorer`**: Scorer instance for evaluating URLs ### 2.3 BestFirstCrawlingStrategy (⭐️ - Recommended Deep crawl strategy) @@ -116,7 +128,8 @@ scorer = KeywordRelevanceScorer( strategy = BestFirstCrawlingStrategy( max_depth=2, include_external=False, - url_scorer=scorer + url_scorer=scorer, + max_pages=25, # Maximum number of pages to crawl (optional) ) ``` @@ -124,6 +137,8 @@ This crawling approach: - Evaluates each discovered URL based on scorer criteria - Visits higher-scoring pages first - Helps focus crawl resources on the most relevant content +- Can limit total pages crawled with `max_pages` +- Does not need `score_threshold` as it naturally prioritizes by score --- @@ -410,27 +425,64 @@ if __name__ == "__main__": --- -## 8. Common Pitfalls & Tips +## 8. Limiting and Controlling Crawl Size -1.**Set realistic depth limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. 
+### 8.1 Using max_pages + +You can limit the total number of pages crawled with the `max_pages` parameter: + +```python +# Limit to exactly 20 pages regardless of depth +strategy = BFSDeepCrawlStrategy( + max_depth=3, + max_pages=20 +) +``` + +This feature is useful for: +- Controlling API costs +- Setting predictable execution times +- Focusing on the most important content +- Testing crawl configurations before full execution + +### 8.2 Using score_threshold + +For BFS and DFS strategies, you can set a minimum score threshold to only crawl high-quality pages: + +```python +# Only follow links with scores above 0.4 +strategy = DFSDeepCrawlStrategy( + max_depth=2, + url_scorer=KeywordRelevanceScorer(keywords=["api", "guide", "reference"]), + score_threshold=0.4 # Skip URLs with scores below this value +) +``` + +Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pages are already processed in order of highest score first. + +## 9. Common Pitfalls & Tips + +1.**Set realistic limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. Use `max_pages` to set hard limits. 2.**Don't neglect the scoring component.** BestFirstCrawling works best with well-tuned scorers. Experiment with keyword weights for optimal prioritization. 3.**Be a good web citizen.** Respect robots.txt. (disabled by default) +4.**Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` and `result.error_message` when processing results. -4.**Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` and `result.error_message` when processing results. +5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling. --- -## 9. Summary & Next Steps +## 10. 
Summary & Next Steps In this **Deep Crawling with Crawl4AI** tutorial, you learned to: -- Configure **BFSDeepCrawlStrategy** and **BestFirstCrawlingStrategy** +- Configure **BFSDeepCrawlStrategy**, **DFSDeepCrawlStrategy**, and **BestFirstCrawlingStrategy** - Process results in streaming or non-streaming mode - Apply filters to target specific content - Use scorers to prioritize the most relevant pages +- Limit crawls with `max_pages` and `score_threshold` parameters - Build a complete advanced crawler with combined techniques With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case. diff --git a/docs/md_v2/core/link-media.md b/docs/md_v2/core/link-media.md index ed56e8fb..cccc8df0 100644 --- a/docs/md_v2/core/link-media.md +++ b/docs/md_v2/core/link-media.md @@ -133,19 +133,28 @@ This approach is handy when you still want external links but need to block cert ### 3.1 Accessing `result.media` -By default, Crawl4AI collects images, audio, and video URLs it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`). +By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`). 
**Basic Example**: ```python if result.success: + # Get images images_info = result.media.get("images", []) print(f"Found {len(images_info)} images in total.") - for i, img in enumerate(images_info[:5]): # Inspect just the first 5 + for i, img in enumerate(images_info[:3]): # Inspect just the first 3 print(f"[Image {i}] URL: {img['src']}") print(f" Alt text: {img.get('alt', '')}") print(f" Score: {img.get('score')}") print(f" Description: {img.get('desc', '')}\n") + + # Get tables + tables = result.media.get("tables", []) + print(f"Found {len(tables)} data tables in total.") + for i, table in enumerate(tables): + print(f"[Table {i}] Caption: {table.get('caption', 'No caption')}") + print(f" Columns: {len(table.get('headers', []))}") + print(f" Rows: {len(table.get('rows', []))}") ``` **Structure Example**: @@ -171,6 +180,19 @@ result.media = { ], "audio": [ # Similar structure but with audio-specific fields + ], + "tables": [ + { + "headers": ["Name", "Age", "Location"], + "rows": [ + ["John Doe", "34", "New York"], + ["Jane Smith", "28", "San Francisco"], + ["Alex Johnson", "42", "Chicago"] + ], + "caption": "Employee Directory", + "summary": "Directory of company employees" + }, + # More tables if present ] } ``` @@ -199,7 +221,53 @@ crawler_cfg = CrawlerRunConfig( This setting attempts to discard images from outside the primary domain, keeping only those from the site you’re crawling. -### 3.3 Additional Media Config +### 3.3 Working with Tables + +Crawl4AI can detect and extract structured data from HTML tables. Tables are analyzed based on various criteria to determine if they are actual data tables (as opposed to layout tables), including: + +- Presence of thead and tbody sections +- Use of th elements for headers +- Column consistency +- Text density +- And other factors + +Tables that score above the threshold (default: 7) are extracted and stored in `result.media.tables`. 
+ +**Accessing Table Data**: + +```python +if result.success: + tables = result.media.get("tables", []) + print(f"Found {len(tables)} data tables on the page") + + if tables: + # Access the first table + first_table = tables[0] + print(f"Table caption: {first_table.get('caption', 'No caption')}") + print(f"Headers: {first_table.get('headers', [])}") + + # Print the first 3 rows + for i, row in enumerate(first_table.get('rows', [])[:3]): + print(f"Row {i+1}: {row}") +``` + +**Configuring Table Extraction**: + +You can adjust the sensitivity of the table detection algorithm with: + +```python +crawler_cfg = CrawlerRunConfig( + table_score_threshold=5 # Lower value = more tables detected (default: 7) +) +``` + +Each extracted table contains: +- `headers`: Column header names +- `rows`: List of rows, each containing cell values +- `caption`: Table caption text (if available) +- `summary`: Table summary attribute (if specified) + +### 3.4 Additional Media Config - **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`. - **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`. @@ -273,4 +341,11 @@ if __name__ == "__main__": --- -**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project. \ No newline at end of file +**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project. +### Table Extraction Tips + +- Not all HTML tables are extracted - only those detected as "data tables" vs. layout tables. +- Tables with inconsistent cell counts, nested tables, or those used purely for layout may be skipped. +- If you're missing tables, try adjusting the `table_score_threshold` to a lower value (default is 7). 
+ +The table detection algorithm scores tables based on features like consistent columns, presence of headers, text density, and more. Tables scoring above the threshold are considered data tables worth extracting. diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md index c76bdd4a..ac27e5b2 100644 --- a/docs/md_v2/core/markdown-generation.md +++ b/docs/md_v2/core/markdown-generation.md @@ -175,13 +175,13 @@ prune_filter = PruningContentFilter( For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure: ```python -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LlmConfig +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig from crawl4ai.content_filter_strategy import LLMContentFilter async def main(): # Initialize LLM filter with specific instruction filter = LLMContentFilter( - llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable + llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable instruction=""" Focus on extracting the core educational content. Include: diff --git a/docs/md_v2/core/quickstart.md b/docs/md_v2/core/quickstart.md index 40786b78..de0b7e5e 100644 --- a/docs/md_v2/core/quickstart.md +++ b/docs/md_v2/core/quickstart.md @@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B ```python from crawl4ai.extraction_strategy import JsonCssExtractionStrategy -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig # Generate a schema (one-time cost) html = "

Gaming Laptop

$999.99
" @@ -136,13 +136,13 @@ html = "

Gaming Laptop

$999.99=13.9.4", "cssselect>=1.2.0", - "httpx==0.27.2", + "httpx>=0.27.2", "fake-useragent>=2.0.3", "click>=8.1.7", "pyperclip>=1.8.2", "faust-cchardet>=2.1.19", - "aiohttp>=3.11.11" + "aiohttp>=3.11.11", + "humanize>=4.10.0" ] classifiers = [ "Development Status :: 4 - Beta", @@ -77,7 +78,7 @@ crawl4ai-download-models = "crawl4ai.model_loader:main" crawl4ai-migrate = "crawl4ai.migrations:main" crawl4ai-setup = "crawl4ai.install:post_install" crawl4ai-doctor = "crawl4ai.install:doctor" -crwl = "crawl4ai.cli:cli" +crwl = "crawl4ai.cli:main" [tool.setuptools] packages = {find = {where = ["."], include = ["crawl4ai*"]}} diff --git a/tests/20241401/test_llm_filter.py b/tests/20241401/test_llm_filter.py index 715301f0..6211c429 100644 --- a/tests/20241401/test_llm_filter.py +++ b/tests/20241401/test_llm_filter.py @@ -1,7 +1,7 @@ import os import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig from crawl4ai.content_filter_strategy import LLMContentFilter async def test_llm_filter(): @@ -23,7 +23,7 @@ async def test_llm_filter(): # Initialize LLM filter with focused instruction filter = LLMContentFilter( - llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), + llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), instruction=""" Focus on extracting the core educational content about Python classes. Include: @@ -43,7 +43,7 @@ async def test_llm_filter(): ) filter = LLMContentFilter( - llmConfig = LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), + llm_config = LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')), chunk_token_threshold=2 ** 12 * 2, # 2048 * 2 instruction=""" Extract the main educational content while preserving its original wording and substance completely. 
Your task is to: diff --git a/tests/async/test_chunking_and_extraction_strategies.py b/tests/async/test_chunking_and_extraction_strategies.py index de0bd098..90e17a9d 100644 --- a/tests/async/test_chunking_and_extraction_strategies.py +++ b/tests/async/test_chunking_and_extraction_strategies.py @@ -7,7 +7,7 @@ import json parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.append(parent_dir) -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.chunking_strategy import RegexChunking from crawl4ai.extraction_strategy import LLMExtractionStrategy @@ -49,7 +49,7 @@ async def test_llm_extraction_strategy(): async with AsyncWebCrawler(verbose=True) as crawler: url = "https://www.nbcnews.com/business" extraction_strategy = LLMExtractionStrategy( - llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")), + llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")), instruction="Extract only content related to technology", ) result = await crawler.arun( diff --git a/tests/browser/test_launch_standalone.py b/tests/browser/test_launch_standalone.py new file mode 100644 index 00000000..d60b12f3 --- /dev/null +++ b/tests/browser/test_launch_standalone.py @@ -0,0 +1,17 @@ +from crawl4ai.browser_profiler import BrowserProfiler +import asyncio + + +if __name__ == "__main__": + # Test launching a standalone browser + async def test_standalone_browser(): + profiler = BrowserProfiler() + cdp_url = await profiler.launch_standalone_browser( + browser_type="chromium", + user_data_dir="~/.crawl4ai/browser_profile/test-browser-data", + debugging_port=9222, + headless=False + ) + print(f"CDP URL: {cdp_url}") + + asyncio.run(test_standalone_browser()) \ No newline at end of file diff --git a/tests/docker/test_docker.py b/tests/docker/test_docker.py index 3a95fb31..cf95671e 100644 --- 
a/tests/docker/test_docker.py +++ b/tests/docker/test_docker.py @@ -7,7 +7,7 @@ from crawl4ai import ( BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator, PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode ) -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig from crawl4ai.docker_client import Crawl4aiDockerClient class Crawl4AiTester: @@ -143,7 +143,7 @@ async def test_with_client(): cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( content_filter=LLMContentFilter( - llmConfig=LlmConfig(provider="openai/gpt-40"), + llm_config=LLMConfig(provider="openai/gpt-4o"), instruction="Extract key technical concepts" ) ), diff --git a/tests/docker/test_serialization.py b/tests/docker/test_serialization.py index 3e54ea11..6ce80005 100644 --- a/tests/docker/test_serialization.py +++ b/tests/docker/test_serialization.py @@ -2,7 +2,7 @@ import inspect from typing import Any, Dict from enum import Enum -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig def to_serializable_dict(obj: Any) -> Dict: """ @@ -224,7 +224,7 @@ if __name__ == "__main__": config3 = CrawlerRunConfig( markdown_generator=DefaultMarkdownGenerator( content_filter=LLMContentFilter( - llmConfig = LlmConfig(provider="openai/gpt-4"), + llm_config = LLMConfig(provider="openai/gpt-4"), instruction="Extract key technical concepts", chunk_token_threshold=2000, overlap_rate=0.1 diff --git a/tests/test_web_crawler.py b/tests/test_web_crawler.py index 07a380fe..b8453192 100644 --- a/tests/test_web_crawler.py +++ b/tests/test_web_crawler.py @@ -1,5 +1,5 @@ import unittest, os -from crawl4ai.async_configs import LlmConfig +from crawl4ai import LLMConfig from crawl4ai.web_crawler import WebCrawler from crawl4ai.chunking_strategy import ( RegexChunking, @@ -43,7 +43,7 @@ class TestWebCrawler(unittest.TestCase): word_count_threshold=5, chunking_strategy=FixedLengthWordChunking(chunk_size=100), 
extraction_strategy=LLMExtractionStrategy( - llmConfig=LlmConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")) + llm_config=LLMConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")) ), bypass_cache=True, )