diff --git a/.gitignore b/.gitignore index d485815c..70d56988 100644 --- a/.gitignore +++ b/.gitignore @@ -208,7 +208,7 @@ git_issues.md .next/ .tests/ -.issues/ +# .issues/ .docs/ .issues/ .gitboss/ @@ -218,4 +218,5 @@ manage-collab.sh publish.sh combine.sh combined_output.txt -tree.md \ No newline at end of file +tree.md +.scripts \ No newline at end of file diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 39b6e690..4d85bc8f 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -11,6 +11,7 @@ from .user_agent_generator import UserAgentGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy from .markdown_generation_strategy import MarkdownGenerationStrategy +from typing import Union, List class BrowserConfig: @@ -39,8 +40,8 @@ class BrowserConfig: Default: None. proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. - viewport_width (int): Default viewport width for pages. Default: 1920. - viewport_height (int): Default viewport height for pages. Default: 1080. + viewport_width (int): Default viewport width for pages. Default: 1080. + viewport_height (int): Default viewport height for pages. Default: 600. verbose (bool): Enable verbose logging. Default: True. accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path. 
@@ -79,7 +80,7 @@ class BrowserConfig: chrome_channel: str = "chrome", proxy: str = None, proxy_config: dict = None, - viewport_width: int = 800, + viewport_width: int = 1080, viewport_height: int = 600, accept_downloads: bool = False, downloads_path: str = None, @@ -136,10 +137,15 @@ class BrowserConfig: self.debugging_port = debugging_port user_agenr_generator = UserAgentGenerator() - if self.user_agent_mode != "random": + if self.user_agent_mode != "random" and self.user_agent_generator_config: self.user_agent = user_agenr_generator.generate( **(self.user_agent_generator_config or {}) ) + elif self.user_agent_mode == "random": + self.user_agent = user_agenr_generator.generate() + else: + pass + self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) self.headers.setdefault("sec-ch-ua", self.browser_hint) @@ -158,8 +164,8 @@ class BrowserConfig: chrome_channel=kwargs.get("chrome_channel", "chrome"), proxy=kwargs.get("proxy"), proxy_config=kwargs.get("proxy_config"), - viewport_width=kwargs.get("viewport_width", 1920), - viewport_height=kwargs.get("viewport_height", 1080), + viewport_width=kwargs.get("viewport_width", 1080), + viewport_height=kwargs.get("viewport_height", 600), accept_downloads=kwargs.get("accept_downloads", False), downloads_path=kwargs.get("downloads_path"), storage_state=kwargs.get("storage_state"), @@ -215,6 +221,8 @@ class CrawlerRunConfig: Default: False. prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output. Default: False. + parser_type (str): Type of parser to use for HTML parsing. + Default: "lxml". # Caching Parameters cache_mode (CacheMode or None): Defines how caching is handled. 
@@ -322,6 +330,7 @@ class CrawlerRunConfig: keep_data_attributes: bool = False, remove_forms: bool = False, prettiify: bool = False, + parser_type: str = "lxml", # SSL Parameters fetch_ssl_certificate: bool = False, @@ -345,7 +354,7 @@ class CrawlerRunConfig: semaphore_count: int = 5, # Page Interaction Parameters - js_code=None, + js_code: Union[str, List[str]] = None, js_only: bool = False, ignore_body_visibility: bool = True, scan_full_page: bool = False, @@ -393,6 +402,7 @@ class CrawlerRunConfig: self.keep_data_attributes = keep_data_attributes self.remove_forms = remove_forms self.prettiify = prettiify + self.parser_type = parser_type # SSL Parameters self.fetch_ssl_certificate = fetch_ssl_certificate @@ -478,6 +488,7 @@ class CrawlerRunConfig: keep_data_attributes=kwargs.get("keep_data_attributes", False), remove_forms=kwargs.get("remove_forms", False), prettiify=kwargs.get("prettiify", False), + parser_type=kwargs.get("parser_type", "lxml"), # SSL Parameters fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), @@ -550,6 +561,7 @@ class CrawlerRunConfig: "keep_data_attributes": self.keep_data_attributes, "remove_forms": self.remove_forms, "prettiify": self.prettiify, + "parser_type": self.parser_type, "fetch_ssl_certificate": self.fetch_ssl_certificate, "cache_mode": self.cache_mode, "session_id": self.session_id, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 045fef64..32bd14b8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -2,7 +2,7 @@ import asyncio import base64 import time from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional, Awaitable +from typing import Callable, Dict, Any, List, Optional, Awaitable, Union import os, sys, shutil import tempfile, subprocess from playwright.async_api import async_playwright, Page, Browser, Error, BrowserContext @@ -64,6 +64,36 @@ BROWSER_DISABLE_OPTIONS = [ class ManagedBrowser: + 
""" + Manages the browser process and context. This class allows to connect to the browser using CDP protocol. + + Attributes: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. + browser_process (subprocess.Popen): The process object for the browser. + temp_dir (str): Temporary directory for user data if not provided. + debugging_port (int): Port for debugging the browser. + host (str): Host for debugging the browser. + + Methods: + start(): Starts the browser process and returns the CDP endpoint URL. + _get_browser_path(): Returns the browser executable path based on OS and browser type. + _get_browser_args(): Returns browser-specific command line arguments. + _get_user_data_dir(): Returns the user data directory path. + _cleanup(): Terminates the browser process and removes the temporary directory. + """ + + browser_type: str + user_data_dir: str + headless: bool + browser_process: subprocess.Popen + temp_dir: str + debugging_port: int + host: str def __init__( self, browser_type: str = "chromium", @@ -73,6 +103,20 @@ class ManagedBrowser: host: str = "localhost", debugging_port: int = 9222, ): + """ + Initialize the ManagedBrowser instance. + + Args: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: True. + logger (logging.Logger): Logger instance for logging messages. Default: None. + host (str): Host for debugging the browser. 
Default: "localhost". + debugging_port (int): Port for debugging the browser. Default: 9222. + """ self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless @@ -112,7 +156,17 @@ class ManagedBrowser: raise Exception(f"Failed to start browser: {e}") async def _monitor_browser_process(self): - """Monitor the browser process for unexpected termination.""" + """ + Monitor the browser process for unexpected termination. + + How it works: + 1. Read stdout and stderr from the browser process. + 2. If the process has terminated, log the error message and terminate the browser. + 3. If the shutting_down flag is set, log the normal termination message. + 4. If any other error occurs, log the error message. + + Note: This method should be called in a separate task to avoid blocking the main event loop. + """ if self.browser_process: try: stdout, stderr = await asyncio.gather( @@ -233,6 +287,19 @@ class ManagedBrowser: class BrowserManager: + """ + Manages the browser instance and context. + + Attributes: + config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + browser (Browser): The browser instance + default_context (BrowserContext): The default browser context + managed_browser (ManagedBrowser): The managed browser instance + playwright (Playwright): The Playwright instance + sessions (dict): Dictionary to store session information + session_ttl (int): Session timeout in seconds + """ def __init__(self, browser_config: BrowserConfig, logger=None): """ Initialize the BrowserManager with a browser configuration. @@ -265,7 +332,17 @@ class BrowserManager: ) async def start(self): - """Start the browser instance and set up the default context.""" + """ + Start the browser instance and set up the default context. + + How it works: + 1. Check if Playwright is already initialized. + 2. If not, initialize Playwright. + 3. 
If managed browser is used, start it and connect to the CDP endpoint. + 4. If managed browser is not used, launch the browser and set up the default context. + + Note: This method should be called in a separate task to avoid blocking the main event loop. + """ if self.playwright is None: from playwright.async_api import async_playwright @@ -382,7 +459,34 @@ class BrowserManager: crawlerRunConfig: CrawlerRunConfig, is_default=False, ): - """Set up a browser context with the configured options.""" + """ + Set up a browser context with the configured options. + + How it works: + 1. Set extra HTTP headers if provided. + 2. Add cookies if provided. + 3. Load storage state if provided. + 4. Accept downloads if enabled. + 5. Set default timeouts for navigation and download. + 6. Set user agent if provided. + 7. Set browser hints if provided. + 8. Set proxy if provided. + 9. Set downloads path if provided. + 10. Set storage state if provided. + 11. Set cache if provided. + 12. Set extra HTTP headers if provided. + 13. Add cookies if provided. + 14. Set default timeouts for navigation and download if enabled. + 15. Set user agent if provided. + 16. Set browser hints if provided. + + Args: + context (BrowserContext): The browser context to set up + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + is_default (bool): Flag indicating if this is the default context + Returns: + None + """ if self.config.headers: await context.set_extra_http_headers(self.config.headers) @@ -489,7 +593,16 @@ class BrowserManager: # async def get_page(self, session_id: Optional[str], user_agent: str): async def get_page(self, crawlerRunConfig: CrawlerRunConfig): - """Get a page for the given session ID, creating a new one if needed.""" + """ + Get a page for the given session ID, creating a new one if needed. 
+ + Args: + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + + Returns: + Page: The page object for the given session ID. + BrowserContext: The browser context for the given session ID. + """ self._cleanup_expired_sessions() if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: @@ -511,7 +624,12 @@ class BrowserManager: return page, context async def kill_session(self, session_id: str): - """Kill a browser session and clean up resources.""" + """ + Kill a browser session and clean up resources. + + Args: + session_id (str): The session ID to kill. + """ if session_id in self.sessions: context, page, _ = self.sessions[session_id] await page.close() @@ -554,16 +672,44 @@ class BrowserManager: class AsyncCrawlerStrategy(ABC): + """ + Abstract base class for crawler strategies. + Subclasses must implement the crawl method. + """ @abstractmethod async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: pass # 4 + 3 - @abstractmethod - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - pass class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + """ + Crawler strategy using Playwright. + + Attributes: + browser_config (BrowserConfig): Configuration object containing browser settings. + logger (AsyncLogger): Logger instance for recording events and errors. + _downloaded_files (List[str]): List of downloaded file paths. + hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior. + browser_manager (BrowserManager): Manager for browser creation and management. + + Methods: + __init__(self, browser_config=None, logger=None, **kwargs): + Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. + __aenter__(self): + Start the browser and initialize the browser manager. + __aexit__(self, exc_type, exc_val, exc_tb): + Close the browser and clean up resources. + start(self): + Start the browser and initialize the browser manager. 
+ close(self): + Close the browser and clean up resources. + kill_session(self, session_id): + Kill a browser session and clean up resources. + crawl(self, url, **kwargs): + Run the crawler for a single URL. + + """ def __init__( self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs ): @@ -608,6 +754,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.close() async def start(self): + """ + Start the browser and initialize the browser manager. + """ await self.browser_manager.start() await self.execute_hook( "on_browser_created", @@ -616,9 +765,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) async def close(self): + """ + Close the browser and clean up resources. + """ await self.browser_manager.close() async def kill_session(self, session_id: str): + """ + Kill a browser session and clean up resources. + + Args: + session_id (str): The ID of the session to kill. + + Returns: + None + """ # Log a warning message and no need kill session, in new version auto kill session self.logger.warning( message="Session auto-kill is enabled in the new version. No need to manually kill sessions.", @@ -627,12 +788,43 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.browser_manager.kill_session(session_id) def set_hook(self, hook_type: str, hook: Callable): + """ + Set a hook function for a specific hook type. Following are list of hook types: + - on_browser_created: Called when a new browser instance is created. + - on_page_context_created: Called when a new page context is created. + - on_user_agent_updated: Called when the user agent is updated. + - on_execution_started: Called when the execution starts. + - before_goto: Called before a goto operation. + - after_goto: Called after a goto operation. + - before_return_html: Called before returning HTML content. + - before_retrieve_html: Called before retrieving HTML content. 
+ + All hooks except on_browser_created accepts a context and a page as arguments and **kwargs. However, on_browser_created accepts a browser and a context as arguments and **kwargs. + + Args: + hook_type (str): The type of the hook. + hook (Callable): The hook function to set. + + Returns: + None + """ if hook_type in self.hooks: self.hooks[hook_type] = hook else: raise ValueError(f"Invalid hook type: {hook_type}") async def execute_hook(self, hook_type: str, *args, **kwargs): + """ + Execute a hook function for a specific hook type. + + Args: + hook_type (str): The type of the hook. + *args: Variable length positional arguments. + **kwargs: Keyword arguments. + + Returns: + The return value of the hook function, if any. + """ hook = self.hooks.get(hook_type) if hook: if asyncio.iscoroutinefunction(hook): @@ -642,12 +834,47 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return args[0] if args else None def update_user_agent(self, user_agent: str): + """ + Update the user agent for the browser. + + Args: + user_agent (str): The new user agent string. + + Returns: + None + """ self.user_agent = user_agent def set_custom_headers(self, headers: Dict[str, str]): + """ + Set custom headers for the browser. + + Args: + headers (Dict[str, str]): A dictionary of headers to set. + + Returns: + None + """ self.headers = headers async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + """ + Wait for a condition in a smart way. This functions works as below: + + 1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true. + 2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present. + 3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true. + 4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present. 
+ + This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl(). + Args: + page: Playwright page object + wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'. + timeout (float): Maximum time to wait in milliseconds + + Returns: + None + """ wait_for = wait_for.strip() if wait_for.startswith("js:"): @@ -694,33 +921,60 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "or explicitly prefixed with 'js:' or 'css:'." ) - async def csp_compliant_wait( - self, page: Page, user_wait_function: str, timeout: float = 30000 - ): + async def csp_compliant_wait( self, page: Page, user_wait_function: str, timeout: float = 30000 ): + """ + Wait for a condition in a CSP-compliant way. + + Args: + page: Playwright page object + user_wait_function: JavaScript function as string that returns boolean + timeout: Maximum time to wait in milliseconds + + Returns: + bool: True if condition was met, False if timed out + + Raises: + RuntimeError: If there's an error evaluating the condition + """ wrapper_js = f""" async () => {{ const userFunction = {user_wait_function}; const startTime = Date.now(); - while (true) {{ - if (await userFunction()) {{ - return true; + try {{ + while (true) {{ + if (await userFunction()) {{ + return true; + }} + if (Date.now() - startTime > {timeout}) {{ + return false; // Return false instead of throwing + }} + await new Promise(resolve => setTimeout(resolve, 100)); }} - if (Date.now() - startTime > {timeout}) {{ - throw new Error('Timeout waiting for condition'); - }} - await new Promise(resolve => setTimeout(resolve, 100)); + }} catch (error) {{ + throw new Error(`Error evaluating condition: ${{error.message}}`); }} }} """ try: - await page.evaluate(wrapper_js) - except TimeoutError: - raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") + result = await page.evaluate(wrapper_js) + return result except Exception as e: - 
raise RuntimeError(f"Error in wait condition: {str(e)}") + if "Error evaluating condition" in str(e): + raise RuntimeError(f"Failed to evaluate wait condition: {str(e)}") + # For timeout or other cases, just return False + return False async def process_iframes(self, page): + """ + Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content. + + Args: + page: Playwright page object + + Returns: + Playwright page object + """ # Find all iframes iframes = await page.query_selector_all("iframe") @@ -776,7 +1030,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return page async def create_session(self, **kwargs) -> str: - """Creates a new browser session and returns its ID. A browse session is a unique openned page can be reused for multiple crawls.""" + """ + Creates a new browser session and returns its ID. A browse session is a unique openned page can be reused for multiple crawls. + This function is asynchronous and returns a string representing the session ID. + + Args: + **kwargs: Optional keyword arguments to configure the session. + + Returns: + str: The session ID. + """ await self.start() session_id = kwargs.get("session_id") or str(uuid.uuid4()) @@ -786,9 +1049,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page, context = await self.browser_manager.get_page(session_id, user_agent) return session_id - async def crawl( - self, url: str, config: CrawlerRunConfig, **kwargs - ) -> AsyncCrawlResponse: + async def crawl( self, url: str, config: CrawlerRunConfig, **kwargs ) -> AsyncCrawlResponse: """ Crawls a given URL or processes raw HTML/local file content based on the URL prefix. @@ -796,7 +1057,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): url (str): The URL to crawl. Supported prefixes: - 'http://' or 'https://': Web URL to crawl. - 'file://': Local file path to process. - - 'raw:': Raw HTML content to process. 
+ - 'raw://': Raw HTML content to process. **kwargs: Additional parameters: - 'screenshot' (bool): Whether to take a screenshot. - ... [other existing parameters] @@ -829,9 +1090,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): get_delayed_content=None, ) - elif url.startswith("raw:"): + elif url.startswith("raw:") or url.startswith("raw://"): # Process raw HTML content - raw_html = url[4:] # Remove 'raw:' prefix + raw_html = url[4:] if url[:4] == "raw:" else url[7:] html = raw_html if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) @@ -847,9 +1108,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "URL must start with 'http://', 'https://', 'file://', or 'raw:'" ) - async def _crawl_web( - self, url: str, config: CrawlerRunConfig - ) -> AsyncCrawlResponse: + async def _crawl_web( self, url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse: """ Internal method to crawl web URLs with the specified configuration. @@ -931,6 +1190,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.execute_hook("before_goto", page, context=context, url=url) try: + # Generate a unique nonce for this request + nonce = hashlib.sha256(os.urandom(32)).hexdigest() + + # Add CSP headers to the request + await page.set_extra_http_headers({ + 'Content-Security-Policy': f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'" + }) + response = await page.goto( url, wait_until=config.wait_until, timeout=config.page_timeout ) @@ -953,35 +1220,29 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Wait for body element and visibility try: await page.wait_for_selector("body", state="attached", timeout=30000) - await page.wait_for_function( - """ - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - } - """, - timeout=30000, - ) - except Error as 
e: - visibility_info = await page.evaluate( - """ - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return { - display: style.display, - visibility: style.visibility, - opacity: style.opacity, - hasContent: body.innerHTML.length, - classList: Array.from(body.classList) - } - } - """ + + # Use the new check_visibility function with csp_compliant_wait + is_visible = await self.csp_compliant_wait( + page, + """() => { + const element = document.body; + if (!element) return false; + const style = window.getComputedStyle(element); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + return isVisible; + }""", + timeout=30000 ) + + if not is_visible and not config.ignore_body_visibility: + visibility_info = await self.check_visibility(page) + raise Error(f"Body element is hidden: {visibility_info}") + except Error as e: + visibility_info = await self.check_visibility(page) + if self.config.verbose: self.logger.debug( message="Body visibility info: {info}", @@ -990,7 +1251,50 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) if not config.ignore_body_visibility: - raise Error(f"Body element is hidden: {visibility_info}") + raise Error(f"Body element is hidden: {visibility_info}") + + + # try: + # await page.wait_for_selector("body", state="attached", timeout=30000) + + # await page.wait_for_function( + # """ + # () => { + # const body = document.body; + # const style = window.getComputedStyle(body); + # return style.display !== 'none' && + # style.visibility !== 'hidden' && + # style.opacity !== '0'; + # } + # """, + # timeout=30000, + # ) + # except Error as e: + # visibility_info = await page.evaluate( + # """ + # () => { + # const body = document.body; + # const style = window.getComputedStyle(body); + # return { + # display: style.display, + # visibility: style.visibility, + # opacity: style.opacity, + # hasContent: body.innerHTML.length, + # classList: 
Array.from(body.classList) + # } + # } + # """ + # ) + + # if self.config.verbose: + # self.logger.debug( + # message="Body visibility info: {info}", + # tag="DEBUG", + # params={"info": visibility_info}, + # ) + + # if not config.ignore_body_visibility: + # raise Error(f"Body element is hidden: {visibility_info}") # Handle content loading and viewport adjustment if not self.browser_config.text_mode and ( @@ -998,23 +1302,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ): await page.wait_for_load_state("domcontentloaded") await asyncio.sleep(0.1) - try: - await page.wait_for_function( - "Array.from(document.images).every(img => img.complete)", - timeout=1000, + + # Check for image loading with improved error handling + images_loaded = await self.csp_compliant_wait( + page, + "() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)", + timeout=1000 + ) + + if not images_loaded and self.logger: + self.logger.warning( + message="Some images failed to load within timeout", + tag="SCRAPE", ) - except PlaywrightTimeoutError: - pass # Adjust viewport if needed if not self.browser_config.text_mode and config.adjust_viewport_to_content: try: - page_width = await page.evaluate( - "document.documentElement.scrollWidth" - ) - page_height = await page.evaluate( - "document.documentElement.scrollHeight" - ) + dimensions = await self.get_page_dimensions(page) + page_height = dimensions['height'] + page_width = dimensions['width'] + # page_width = await page.evaluate( + # "document.documentElement.scrollWidth" + # ) + # page_height = await page.evaluate( + # "document.documentElement.scrollHeight" + # ) target_width = self.browser_config.viewport_width target_height = int(target_width * page_width / page_height * 0.95) @@ -1046,12 +1359,22 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self._handle_full_page_scan(page, config.scroll_delay) # Execute JavaScript if provided + # if config.js_code: + # if 
isinstance(config.js_code, str): + # await page.evaluate(config.js_code) + # elif isinstance(config.js_code, list): + # for js in config.js_code: + # await page.evaluate(js) + if config.js_code: - if isinstance(config.js_code, str): - await page.evaluate(config.js_code) - elif isinstance(config.js_code, list): - for js in config.js_code: - await page.evaluate(js) + # execution_result = await self.execute_user_script(page, config.js_code) + execution_result = await self.robust_execute_user_script(page, config.js_code) + if not execution_result["success"]: + self.logger.warning( + message="User script execution had issues: {error}", + tag="JS_EXEC", + params={"error": execution_result.get("error")} + ) await self.execute_hook("on_execution_started", page, context=context) @@ -1154,30 +1477,52 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): raise e async def _handle_full_page_scan(self, page: Page, scroll_delay: float): - """Helper method to handle full page scanning""" + """ + Helper method to handle full page scanning. + + How it works: + 1. Get the viewport height. + 2. Scroll to the bottom of the page. + 3. Get the total height of the page. + 4. Scroll back to the top of the page. + 5. Scroll to the bottom of the page again. + 6. Continue scrolling until the bottom of the page is reached. 
+ + Args: + page (Page): The Playwright page object + scroll_delay (float): The delay between page scrolls + + """ try: viewport_height = page.viewport_size.get( "height", self.browser_config.viewport_height ) current_position = viewport_height - await page.evaluate(f"window.scrollTo(0, {current_position})") - await asyncio.sleep(scroll_delay) - - total_height = await page.evaluate("document.documentElement.scrollHeight") + # await page.evaluate(f"window.scrollTo(0, {current_position})") + await self.safe_scroll(page, 0, current_position) + # await self.csp_scroll_to(page, 0, current_position) + # await asyncio.sleep(scroll_delay) + # total_height = await page.evaluate("document.documentElement.scrollHeight") + dimensions = await self.get_page_dimensions(page) + total_height = dimensions['height'] + while current_position < total_height: current_position = min(current_position + viewport_height, total_height) - await page.evaluate(f"window.scrollTo(0, {current_position})") - await asyncio.sleep(scroll_delay) + await self.safe_scroll(page, 0, current_position) + # await page.evaluate(f"window.scrollTo(0, {current_position})") + # await asyncio.sleep(scroll_delay) - new_height = await page.evaluate( - "document.documentElement.scrollHeight" - ) + # new_height = await page.evaluate("document.documentElement.scrollHeight") + dimensions = await self.get_page_dimensions(page) + new_height = dimensions['height'] + if new_height > total_height: total_height = new_height - await page.evaluate("window.scrollTo(0, 0)") + # await page.evaluate("window.scrollTo(0, 0)") + await self.safe_scroll(page, 0, 0) except Exception as e: self.logger.warning( @@ -1186,10 +1531,27 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): params={"error": str(e)}, ) else: - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + # await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await self.safe_scroll(page, 0, total_height) async def 
_handle_download(self, download): - """Handle file downloads.""" + """ + Handle file downloads. + + How it works: + 1. Get the suggested filename. + 2. Get the download path. + 3. Log the download. + 4. Start the download. + 5. Save the downloaded file. + 6. Log the completion. + + Args: + download (Download): The Playwright download object + + Returns: + None + """ try: suggested_filename = download.suggested_filename download_path = os.path.join(self.downloads_path, suggested_filename) @@ -1221,21 +1583,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): params={"error": str(e)}, ) - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - semaphore_count = kwargs.get("semaphore_count", 5) # Adjust as needed - semaphore = asyncio.Semaphore(semaphore_count) - - async def crawl_with_semaphore(url): - async with semaphore: - return await self.crawl(url, **kwargs) - - tasks = [crawl_with_semaphore(url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - return [ - result if not isinstance(result, Exception) else str(result) - for result in results - ] - async def remove_overlay_elements(self, page: Page) -> None: """ Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. 
@@ -1246,7 +1593,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): remove_overlays_js = load_js_script("remove_overlay_elements") try: - await page.evaluate(remove_overlays_js) + await page.evaluate(f""" + (() => {{ + try {{ + {remove_overlays_js} + return {{ success: true }}; + }} catch (error) {{ + return {{ + success: false, + error: error.toString(), + stack: error.stack + }}; + }} + }})() + """) await page.wait_for_timeout(500) # Wait for any animations to complete except Exception as e: self.logger.warning( @@ -1258,12 +1618,29 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): async def export_pdf(self, page: Page) -> bytes: """ Exports the current page as a PDF. + + Args: + page (Page): The Playwright page object + + Returns: + bytes: The PDF data """ pdf_data = await page.pdf(print_background=True) return pdf_data async def take_screenshot(self, page, **kwargs) -> str: - page_height = await page.evaluate("document.documentElement.scrollHeight") + """ + Take a screenshot of the current page. + + Args: + page (Page): The Playwright page object + kwargs: Additional keyword arguments + + Returns: + str: The base64-encoded screenshot data + """ + dimensions = await self.get_page_dimensions(page) + page_height = dimensions['height'] if page_height < kwargs.get( "screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD ): @@ -1276,8 +1653,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str: """ - Convert the first page of the PDF to a screenshot. + Convert the first page of the PDF to a screenshot. + Requires pdf2image and poppler. + + Args: + pdf_data (bytes): The PDF data + + Returns: + str: The base64-encoded screenshot data """ try: from pdf2image import convert_from_bytes @@ -1307,11 +1691,23 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ Attempt to set a large viewport and take a full-page screenshot. 
If still too large, segment the page as before. + + Requires pdf2image and poppler. + + Args: + page (Page): The Playwright page object + kwargs: Additional keyword arguments + + Returns: + str: The base64-encoded screenshot data """ try: # Get page height - page_height = await page.evaluate("document.documentElement.scrollHeight") - page_width = await page.evaluate("document.documentElement.scrollWidth") + dimensions = await self.get_page_dimensions(page) + page_width = dimensions['width'] + page_height = dimensions['height'] + # page_height = await page.evaluate("document.documentElement.scrollHeight") + # page_width = await page.evaluate("document.documentElement.scrollWidth") # Set a large viewport large_viewport_height = min( @@ -1406,6 +1802,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ Exports the current storage state (cookies, localStorage, sessionStorage) to a JSON file at the specified path. + + Args: + path (str): The path to save the storage state JSON file + + Returns: + dict: The exported storage state """ if self.default_context: state = await self.default_context.storage_state(path=path) @@ -1421,39 +1823,339 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): tag="WARNING", ) - async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: + async def robust_execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]: """ - Generates a screenshot from raw HTML content. - - Args: - html (str): The HTML content to render and capture. - + Executes user-provided JavaScript code with proper error handling and context, + supporting both synchronous and async user code, plus navigations. + + How it works: + 1. Wait for load state 'domcontentloaded' + 2. If js_code is a string, execute it directly + 3. If js_code is a list, execute each element in sequence + 4. Wait for load state 'networkidle' + 5. 
Return results + + Args: + page (Page): The Playwright page instance + js_code (Union[str, List[str]]): The JavaScript code to execute + Returns: - Optional[str]: Base64-encoded screenshot image or an error image if failed. + Dict[str, Any]: The results of the execution """ try: - await self.start() - # Create a temporary page without a session_id - page, context = await self.browser_manager.get_page(None, self.user_agent) + await page.wait_for_load_state('domcontentloaded') + + if isinstance(js_code, str): + scripts = [js_code] + else: + scripts = js_code + + results = [] + for script in scripts: + try: + # Attempt the evaluate + # If the user code triggers navigation, we catch the "context destroyed" error + # then wait for the new page to load before continuing + result = None + try: + result = await page.evaluate(f""" + (async () => {{ + try {{ + {script} + return {{ success: true }}; + }} catch (err) {{ + return {{ success: false, error: err.toString(), stack: err.stack }}; + }} + }})(); + """) + except Error as e: + # If it's due to navigation destroying the context, handle gracefully + if "Execution context was destroyed" in str(e): + self.logger.info("Navigation triggered by script, waiting for load state", tag="JS_EXEC") + try: + await page.wait_for_load_state('load', timeout=30000) + except Error as nav_err: + self.logger.warning( + message="Navigation wait failed: {error}", + tag="JS_EXEC", + params={"error": str(nav_err)} + ) + try: + await page.wait_for_load_state('networkidle', timeout=30000) + except Error as nav_err: + self.logger.warning( + message="Network idle wait failed: {error}", + tag="JS_EXEC", + params={"error": str(nav_err)} + ) + # Return partial success, or adapt as you see fit + result = { + "success": True, + "info": "Navigation triggered, ignoring context destroyed error" + } + else: + # It's some other error, log and continue + self.logger.error( + message="Playwright execution error: {error}", + tag="JS_EXEC", + params={"error": 
str(e)} + ) + result = {"success": False, "error": str(e)} + + # If we made it this far with no repeated error, do post-load waits + t1 = time.time() + try: + await page.wait_for_load_state('domcontentloaded', timeout=5000) + print("DOM content loaded after script execution in", time.time() - t1) + except Error as e: + self.logger.warning( + message="DOM content load timeout: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + + # t1 = time.time() + # try: + # await page.wait_for_load_state('networkidle', timeout=5000) + # print("Network idle after script execution in", time.time() - t1) + # except Error as e: + # self.logger.warning( + # message="Network idle timeout: {error}", + # tag="JS_EXEC", + # params={"error": str(e)} + # ) - await page.set_content(html, wait_until="networkidle") - screenshot = await page.screenshot(full_page=True) - await page.close() - return base64.b64encode(screenshot).decode("utf-8") + results.append(result if result else {"success": True}) + + except Exception as e: + # Catch anything else + self.logger.error( + message="Script chunk failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + results.append({"success": False, "error": str(e)}) + + return {"success": True, "results": results} + except Exception as e: - error_message = f"Failed to take screenshot: {str(e)}" self.logger.error( - message="Screenshot failed: {error}", - tag="ERROR", - params={"error": error_message}, + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} ) + return {"success": False, "error": str(e)} - # Generate an error image - img = Image.new("RGB", (800, 600), color="black") - draw = ImageDraw.Draw(img) - font = ImageFont.load_default() - draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + async def execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]: + """ + Executes user-provided JavaScript code with proper error handling and context. 
+ + Args: + page: Playwright page object + js_code: Single JavaScript string or list of JavaScript code strings + + Returns: + Dict containing execution status and results/errors + """ + try: + # Ensure the page is ready for script execution + await page.wait_for_load_state('domcontentloaded') + + # Handle single script or multiple scripts + if isinstance(js_code, str): + scripts = [js_code] + else: + scripts = js_code + + results = [] + for script in scripts: + try: + # Execute the script and wait for network idle + result = await page.evaluate(f""" + (() => {{ + return new Promise((resolve) => {{ + try {{ + const result = (function() {{ + {script} + }})(); + + // If result is a promise, wait for it + if (result instanceof Promise) {{ + result.then(() => {{ + // Wait a bit for any triggered effects + setTimeout(() => resolve({{ success: true }}), 100); + }}).catch(error => {{ + resolve({{ + success: false, + error: error.toString(), + stack: error.stack + }}); + }}); + }} else {{ + // For non-promise results, still wait a bit for effects + setTimeout(() => resolve({{ success: true }}), 100); + }} + }} catch (error) {{ + resolve({{ + success: false, + error: error.toString(), + stack: error.stack + }}); + }} + }}); + }})() + """) + + # Wait for network idle after script execution + t1 = time.time() + await page.wait_for_load_state('domcontentloaded', timeout=5000) + print("DOM content loaded after script execution in", time.time() - t1) - buffered = BytesIO() - img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode("utf-8") + t1 = time.time() + await page.wait_for_load_state('networkidle', timeout=5000) + print("Network idle after script execution in", time.time() - t1) + + results.append(result if result else {"success": True}) + + except Error as e: + # Handle Playwright-specific errors + self.logger.error( + message="Playwright execution error: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + results.append({"success": 
False, "error": str(e)}) + + return {"success": True, "results": results} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + return {"success": False, "error": str(e)} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + return {"success": False, "error": str(e)} + + async def check_visibility(self, page): + """ + Checks if an element is visible on the page. + + Args: + page: Playwright page object + + Returns: + Boolean indicating visibility + """ + return await page.evaluate(""" + () => { + const element = document.body; + if (!element) return false; + const style = window.getComputedStyle(element); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + return isVisible; + } + """) + + async def safe_scroll(self, page: Page, x: int, y: int): + """ + Safely scroll the page with rendering time. + + Args: + page: Playwright page object + x: Horizontal scroll position + y: Vertical scroll position + """ + result = await self.csp_scroll_to(page, x, y) + if result['success']: + await page.wait_for_timeout(100) # Allow for rendering + return result + + async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]: + """ + Performs a CSP-compliant scroll operation and returns the result status. 
+ + Args: + page: Playwright page object + x: Horizontal scroll position + y: Vertical scroll position + + Returns: + Dict containing scroll status and position information + """ + try: + result = await page.evaluate( + f"""() => {{ + try {{ + const startX = window.scrollX; + const startY = window.scrollY; + window.scrollTo({x}, {y}); + + // Get final position after scroll + const endX = window.scrollX; + const endY = window.scrollY; + + return {{ + success: true, + startPosition: {{ x: startX, y: startY }}, + endPosition: {{ x: endX, y: endY }}, + targetPosition: {{ x: {x}, y: {y} }}, + delta: {{ + x: Math.abs(endX - {x}), + y: Math.abs(endY - {y}) + }} + }}; + }} catch (e) {{ + return {{ + success: false, + error: e.toString() + }}; + }} + }}""" + ) + + if not result['success']: + self.logger.warning( + message="Scroll operation failed: {error}", + tag="SCROLL", + params={"error": result.get('error')} + ) + + return result + + except Exception as e: + self.logger.error( + message="Failed to execute scroll: {error}", + tag="SCROLL", + params={"error": str(e)} + ) + return { + "success": False, + "error": str(e) + } + + async def get_page_dimensions(self, page: Page): + """ + Get the dimensions of the page. 
+ + Args: + page: Playwright page object + + Returns: + Dict containing width and height of the page + """ + return await page.evaluate(""" + () => { + const {scrollWidth, scrollHeight} = document.documentElement; + return {width: scrollWidth, height: scrollHeight}; + } + """) \ No newline at end of file diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 5cdafac2..aed9c76b 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -7,7 +7,7 @@ from contextlib import asynccontextmanager import logging import json # Added for serialization/deserialization from .utils import ensure_content_dirs, generate_content_hash -from .models import CrawlResult +from .models import CrawlResult, MarkdownGenerationResult import xxhash import aiofiles from .config import NEED_MIGRATION @@ -295,13 +295,18 @@ class AsyncDatabaseManager: row_dict[field] = "" # Parse JSON fields - json_fields = ['media', 'links', 'metadata', 'response_headers'] + json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown'] for field in json_fields: try: row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {} except json.JSONDecodeError: row_dict[field] = {} + if isinstance(row_dict['markdown'], Dict): + row_dict['markdown_v2'] = row_dict['markdown'] + if row_dict['markdown'].get('raw_markdown'): + row_dict['markdown'] = row_dict['markdown']['raw_markdown'] + # Parse downloaded_files try: row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else [] @@ -331,10 +336,28 @@ class AsyncDatabaseManager: content_map = { 'html': (result.html, 'html'), 'cleaned_html': (result.cleaned_html or "", 'cleaned'), - 'markdown': (result.markdown or "", 'markdown'), + 'markdown': None, 'extracted_content': (result.extracted_content or "", 'extracted'), 'screenshot': (result.screenshot or "", 'screenshots') } + + try: + if isinstance(result.markdown, MarkdownGenerationResult): + 
content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown') + elif hasattr(result, 'markdown_v2'): + content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown') + elif isinstance(result.markdown, str): + markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown) + content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown') + else: + content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown') + except Exception as e: + self.logger.warning( + message=f"Error processing markdown content: {str(e)}", + tag="WARNING" + ) + # Fallback to empty markdown result + content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown') content_hashes = {} for field, (content, content_type) in content_map.items(): diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index dde6c2ce..f120b2c0 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -69,6 +69,24 @@ class AsyncWebCrawler: New way (recommended): browser_config = BrowserConfig(browser_type="chromium", headless=True) crawler = AsyncWebCrawler(config=browser_config) + + + Attributes: + browser_config (BrowserConfig): Configuration object for browser settings. + crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages. + logger (AsyncLogger): Logger instance for recording events and errors. + always_bypass_cache (bool): Whether to always bypass cache. + crawl4ai_folder (str): Directory for storing cache. + base_directory (str): Base directory for storing cache. + ready (bool): Whether the crawler is ready for use. + + Methods: + start(): Start the crawler explicitly without using context manager. + close(): Close the crawler explicitly without using context manager. + arun(): Run the crawler for a single source: URL (web, local file, or raw HTML). + awarmup(): Perform warmup sequence. + arun_many(): Run the crawler for multiple sources. 
+ aprocess_html(): Process HTML content. """ _domain_last_hit = {} @@ -321,7 +339,7 @@ class AsyncWebCrawler: # Initialize processing variables async_response: AsyncCrawlResponse = None - cached_result = None + cached_result: CrawlResult = None screenshot_data = None pdf_data = None extracted_content = None @@ -373,52 +391,89 @@ class AsyncWebCrawler: tag="FETCH" ) - # Process the HTML content - crawl_result = await self.aprocess_html( - url=url, - html=html, - extracted_content=extracted_content, - config=config, # Pass the config object instead of individual parameters - screenshot=screenshot_data, - pdf_data=pdf_data, - verbose=config.verbose, - is_raw_html = True if url.startswith("raw:") else False, - **kwargs - ) + # Process the HTML content + crawl_result = await self.aprocess_html( + url=url, + html=html, + extracted_content=extracted_content, + config=config, # Pass the config object instead of individual parameters + screenshot=screenshot_data, + pdf_data=pdf_data, + verbose=config.verbose, + is_raw_html = True if url.startswith("raw:") else False, + **kwargs + ) + + # crawl_result.status_code = async_response.status_code + # crawl_result.response_headers = async_response.response_headers + # crawl_result.downloaded_files = async_response.downloaded_files + # crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate + # else: + # crawl_result.status_code = 200 + # crawl_result.response_headers = cached_result.response_headers if cached_result else {} + # crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache + + # # Check and set values from async_response to crawl_result + try: + for key in vars(async_response): + if hasattr(crawl_result, key): + value = getattr(async_response, key, None) + current_value = getattr(crawl_result, key, None) + if value is not None and not current_value: + try: + setattr(crawl_result, key, value) + except Exception as e: + 
self.logger.warning( + message=f"Failed to set attribute {key}: {str(e)}", + tag="WARNING" + ) + except Exception as e: + self.logger.warning( + message=f"Error copying response attributes: {str(e)}", + tag="WARNING" + ) + + crawl_result.success = bool(html) + crawl_result.session_id = getattr(config, 'session_id', None) + + self.logger.success( + message="{url:.50}... | Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) + + return crawl_result - # Set response data - if async_response: - crawl_result.status_code = async_response.status_code - crawl_result.response_headers = async_response.response_headers - crawl_result.downloaded_files = async_response.downloaded_files - crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate else: - crawl_result.status_code = 200 - crawl_result.response_headers = cached_result.response_headers if cached_result else {} - crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache + self.logger.success( + message="{url:.50}... | Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": True, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN, + "timing": Fore.YELLOW + } + ) - crawl_result.success = bool(html) - crawl_result.session_id = getattr(config, 'session_id', None) - - self.logger.success( - message="{url:.50}... 
| Status: {status} | Total: {timing}", - tag="COMPLETE", - params={ - "url": cache_context.display_url, - "status": crawl_result.success, - "timing": f"{time.perf_counter() - start_time:.2f}s" - }, - colors={ - "status": Fore.GREEN if crawl_result.success else Fore.RED, - "timing": Fore.YELLOW - } - ) - - # Update cache if appropriate - if cache_context.should_write() and not bool(cached_result): - await async_db_manager.acache_url(crawl_result) - - return crawl_result + cached_result.success = bool(html) + cached_result.session_id = getattr(config, 'session_id', None) + return cached_result except Exception as e: error_context = get_error_context(sys.exc_info()) @@ -465,6 +520,7 @@ class AsyncWebCrawler: extracted_content: Previously extracted content (if any) config: Configuration object controlling processing behavior screenshot: Screenshot data (if any) + pdf_data: PDF data (if any) verbose: Whether to enable verbose logging **kwargs: Additional parameters for backwards compatibility diff --git a/crawl4ai/cache_context.py b/crawl4ai/cache_context.py index 429eacc1..588edd62 100644 --- a/crawl4ai/cache_context.py +++ b/crawl4ai/cache_context.py @@ -25,8 +25,26 @@ class CacheContext: This class centralizes all cache-related logic and URL type checking, making the caching behavior more predictable and maintainable. + + Attributes: + url (str): The URL being processed. + cache_mode (CacheMode): The cache mode for the current operation. + always_bypass (bool): If True, bypasses caching for this operation. + is_cacheable (bool): True if the URL is cacheable, False otherwise. + is_web_url (bool): True if the URL is a web URL, False otherwise. + is_local_file (bool): True if the URL is a local file, False otherwise. + is_raw_html (bool): True if the URL is raw HTML, False otherwise. + _url_display (str): The display name for the URL (web, local file, or raw HTML). 
""" def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False): + """ + Initializes the CacheContext with the provided URL and cache mode. + + Args: + url (str): The URL being processed. + cache_mode (CacheMode): The cache mode for the current operation. + always_bypass (bool): If True, bypasses caching for this operation. + """ self.url = url self.cache_mode = cache_mode self.always_bypass = always_bypass @@ -37,13 +55,31 @@ class CacheContext: self._url_display = url if not self.is_raw_html else "Raw HTML" def should_read(self) -> bool: - """Determines if cache should be read based on context.""" + """ + Determines if cache should be read based on context. + + How it works: + 1. If always_bypass is True or is_cacheable is False, return False. + 2. If cache_mode is ENABLED or READ_ONLY, return True. + + Returns: + bool: True if cache should be read, False otherwise. + """ if self.always_bypass or not self.is_cacheable: return False return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY] def should_write(self) -> bool: - """Determines if cache should be written based on context.""" + """ + Determines if cache should be written based on context. + + How it works: + 1. If always_bypass is True or is_cacheable is False, return False. + 2. If cache_mode is ENABLED or WRITE_ONLY, return True. + + Returns: + bool: True if cache should be written, False otherwise. + """ if self.always_bypass or not self.is_cacheable: return False return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY] diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py index 2af56b32..7b8c08ad 100644 --- a/crawl4ai/chunking_strategy.py +++ b/crawl4ai/chunking_strategy.py @@ -7,22 +7,43 @@ from .utils import * # Define the abstract base class for chunking strategies class ChunkingStrategy(ABC): + """ + Abstract base class for chunking strategies. 
+ """ @abstractmethod def chunk(self, text: str) -> list: """ Abstract method to chunk the given text. + + Args: + text (str): The text to chunk. + + Returns: + list: A list of chunks. """ pass # Create an identity chunking strategy f(x) = [x] class IdentityChunking(ChunkingStrategy): + """ + Chunking strategy that returns the input text as a single chunk. + """ def chunk(self, text: str) -> list: return [text] # Regex-based chunking class RegexChunking(ChunkingStrategy): + """ + Chunking strategy that splits text based on regular expression patterns. + """ def __init__(self, patterns=None, **kwargs): + """ + Initialize the RegexChunking object. + + Args: + patterns (list): A list of regular expression patterns to split text. + """ if patterns is None: patterns = [r'\n\n'] # Default split pattern self.patterns = patterns @@ -38,9 +59,15 @@ class RegexChunking(ChunkingStrategy): # NLP-based sentence chunking class NlpSentenceChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into sentences using NLTK's sentence tokenizer. + """ def __init__(self, **kwargs): + """ + Initialize the NlpSentenceChunking object. + """ load_nltk_punkt() - pass + def chunk(self, text: str) -> list: # Improved regex for sentence splitting @@ -57,8 +84,21 @@ class NlpSentenceChunking(ChunkingStrategy): # Topic-based segmentation using TextTiling class TopicSegmentationChunking(ChunkingStrategy): + """ + Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer. + + How it works: + 1. Segment the text into topics using TextTilingTokenizer + 2. Extract keywords for each topic segment + """ def __init__(self, num_keywords=3, **kwargs): + """ + Initialize the TopicSegmentationChunking object. + + Args: + num_keywords (int): The number of keywords to extract for each topic segment. 
+ """ import nltk as nl self.tokenizer = nl.tokenize.TextTilingTokenizer() self.num_keywords = num_keywords @@ -88,6 +128,14 @@ class TopicSegmentationChunking(ChunkingStrategy): # Fixed-length word chunks class FixedLengthWordChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into fixed-length word chunks. + + How it works: + 1. Split the text into words + 2. Create chunks of fixed length + 3. Return the list of chunks + """ def __init__(self, chunk_size=100, **kwargs): """ Initialize the fixed-length word chunking strategy with the given chunk size. @@ -103,6 +151,14 @@ class FixedLengthWordChunking(ChunkingStrategy): # Sliding window chunking class SlidingWindowChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into overlapping word chunks. + + How it works: + 1. Split the text into words + 2. Create chunks of fixed length + 3. Return the list of chunks + """ def __init__(self, window_size=100, step=50, **kwargs): """ Initialize the sliding window chunking strategy with the given window size and @@ -133,6 +189,15 @@ class SlidingWindowChunking(ChunkingStrategy): return chunks class OverlappingWindowChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into overlapping word chunks. + + How it works: + 1. Split the text into words using whitespace + 2. Create chunks of fixed length equal to the window size + 3. Slide the window by the overlap size + 4. 
Return the list of chunks + """ def __init__(self, window_size=1000, overlap=100, **kwargs): """ Initialize the overlapping window chunking strategy with the given window size and diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index f05b92fa..ab5ae517 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -9,17 +9,8 @@ from .utils import clean_tokens from abc import ABC, abstractmethod import math from snowballstemmer import stemmer - - -# import regex -# def tokenize_text(text): -# # Regular expression to match words or CJK (Chinese, Japanese, Korean) characters -# pattern = r'\p{L}+|\p{N}+|[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}ー]|[\p{P}]' -# return regex.findall(pattern, text) - -# from nltk.stem import PorterStemmer -# ps = PorterStemmer() class RelevantContentFilter(ABC): + """Abstract base class for content filtering strategies""" def __init__(self, user_query: str = None): self.user_query = user_query self.included_tags = { @@ -171,9 +162,8 @@ class RelevantContentFilter(ABC): chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold] return chunks - - def extract_text_chunks1(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]: + def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]: """Common method for extracting text chunks""" _text_cache = {} def fast_text(element: Tag) -> str: @@ -271,7 +261,38 @@ class RelevantContentFilter(ABC): return str(tag) # Fallback to original if anything fails class BM25ContentFilter(RelevantContentFilter): + """ + Content filtering using BM25 algorithm with priority tag handling. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. Extracts text chunks from the body element. + 3. Tokenizes the corpus and query. + 4. Applies BM25 algorithm to calculate scores for each chunk. + 5. Filters out chunks below the threshold. + 6. 
Sorts chunks by score in descending order. + 7. Returns the top N chunks. + + Attributes: + user_query (str): User query for filtering (optional). + bm25_threshold (float): BM25 threshold for filtering (default: 1.0). + language (str): Language for stemming (default: 'english'). + + Methods: + filter_content(self, html: str, min_word_threshold: int = None) + """ def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'): + """ + Initializes the BM25ContentFilter class, if not provided, falls back to page metadata. + + Note: + If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph. + + Args: + user_query (str): User query for filtering (optional). + bm25_threshold (float): BM25 threshold for filtering (default: 1.0). + language (str): Language for stemming (default: 'english'). + """ super().__init__(user_query=user_query) self.bm25_threshold = bm25_threshold self.priority_tags = { @@ -290,7 +311,20 @@ class BM25ContentFilter(RelevantContentFilter): self.stemmer = stemmer(language) def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: - """Implements content filtering using BM25 algorithm with priority tag handling""" + """ + Implements content filtering using BM25 algorithm with priority tag handling. + + Note: + This method implements the filtering logic for the BM25ContentFilter class. + It takes HTML content as input and returns a list of filtered text chunks. + + Args: + html (str): HTML content to be filtered. + min_word_threshold (int): Minimum word threshold for filtering (optional). + + Returns: + List[str]: List of filtered text chunks. 
+ """ if not html or not isinstance(html, str): return [] @@ -357,15 +391,42 @@ class BM25ContentFilter(RelevantContentFilter): return [self.clean_element(tag) for _, _, tag in selected_candidates] - - - - - class PruningContentFilter(RelevantContentFilter): + """ + Content filtering using pruning algorithm with dynamic threshold. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. Extracts text chunks from the body element. + 3. Applies pruning algorithm to calculate scores for each chunk. + 4. Filters out chunks below the threshold. + 5. Sorts chunks by score in descending order. + 6. Returns the top N chunks. + + Attributes: + user_query (str): User query for filtering (optional), if not provided, falls back to page metadata. + min_word_threshold (int): Minimum word threshold for filtering (optional). + threshold_type (str): Threshold type for dynamic threshold (default: 'fixed'). + threshold (float): Fixed threshold value (default: 0.48). + + Methods: + filter_content(self, html: str, min_word_threshold: int = None): + """ def __init__(self, user_query: str = None, min_word_threshold: int = None, threshold_type: str = 'fixed', threshold: float = 0.48): - super().__init__(user_query) + """ + Initializes the PruningContentFilter class, if not provided, falls back to page metadata. + + Note: + If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph. + + Args: + user_query (str): User query for filtering (optional). + min_word_threshold (int): Minimum word threshold for filtering (optional). + threshold_type (str): Threshold type for dynamic threshold (default: 'fixed'). + threshold (float): Fixed threshold value (default: 0.48). 
+ """ + super().__init__(None) self.min_word_threshold = min_word_threshold self.threshold_type = threshold_type self.threshold = threshold @@ -418,6 +479,20 @@ class PruningContentFilter(RelevantContentFilter): } def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: + """ + Implements content filtering using pruning algorithm with dynamic threshold. + + Note: + This method implements the filtering logic for the PruningContentFilter class. + It takes HTML content as input and returns a list of filtered text chunks. + + Args: + html (str): HTML content to be filtered. + min_word_threshold (int): Minimum word threshold for filtering (optional). + + Returns: + List[str]: List of filtered text chunks. + """ if not html or not isinstance(html, str): return [] @@ -444,15 +519,23 @@ class PruningContentFilter(RelevantContentFilter): return content_blocks def _remove_comments(self, soup): + """Removes HTML comments""" for element in soup(text=lambda text: isinstance(text, Comment)): element.extract() def _remove_unwanted_tags(self, soup): + """Removes unwanted tags""" for tag in self.excluded_tags: for element in soup.find_all(tag): element.decompose() def _prune_tree(self, node): + """ + Prunes the tree starting from the given node. + + Args: + node (Tag): The node from which the pruning starts. 
+ """ if not node or not hasattr(node, 'name') or node.name is None: return @@ -495,6 +578,7 @@ class PruningContentFilter(RelevantContentFilter): self._prune_tree(child) def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len): + """Computes the composite score""" if self.min_word_threshold: # Get raw text from metrics node - avoid extra processing text = metrics['node'].get_text(strip=True) @@ -531,6 +615,7 @@ class PruningContentFilter(RelevantContentFilter): return score / total_weight if total_weight > 0 else 0 def _compute_class_id_weight(self, node): + """Computes the class ID weight""" class_id_score = 0 if 'class' in node.attrs: classes = ' '.join(node['class']) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 35fdba1f..985ff592 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -64,6 +64,17 @@ class ContentScrapingStrategy(ABC): pass class WebScrapingStrategy(ContentScrapingStrategy): + """ + Class for web content scraping. Perhaps the most important class. + + How it works: + 1. Extract content from HTML using BeautifulSoup. + 2. Clean the extracted content using a content cleaning strategy. + 3. Filter the cleaned content using a content filtering strategy. + 4. Generate markdown content from the filtered content. + 5. Return the markdown content. + """ + def __init__(self, logger=None): self.logger = logger @@ -74,17 +85,57 @@ class WebScrapingStrategy(ContentScrapingStrategy): log_method(message=message, tag=tag, **kwargs) def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + """ + Main entry point for content scraping. + + Args: + url (str): The URL of the page to scrape. + html (str): The HTML content of the page. + **kwargs: Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the scraped content. 
This dictionary contains the following keys: + + - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'. + - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'. + - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'. + - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown' + """ return self._scrap(url, html, is_async=False, **kwargs) async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + """ + Main entry point for asynchronous content scraping. + + Args: + url (str): The URL of the page to scrape. + html (str): The HTML content of the page. + **kwargs: Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys: + + - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'. + - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'. + - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'. 
+ - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown' + """ return await asyncio.to_thread(self._scrap, url, html, **kwargs) - def _generate_markdown_content(self, - cleaned_html: str, - html: str, - url: str, - success: bool, - **kwargs) -> Dict[str, Any]: + def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]: + """ + Generate markdown content from cleaned HTML. + + Args: + cleaned_html (str): The cleaned HTML content. + html (str): The original HTML content. + url (str): The URL of the page. + success (bool): Whether the content was successfully cleaned. + **kwargs: Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the generated markdown content. + """ markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) if markdown_generator: @@ -158,6 +209,15 @@ class WebScrapingStrategy(ContentScrapingStrategy): """ def flatten_nested_elements(self, node): + """ + Flatten nested elements in a HTML tree. + + Args: + node (Tag): The root node of the HTML tree. + + Returns: + Tag: The flattened HTML tree. + """ if isinstance(node, NavigableString): return node if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name: @@ -166,6 +226,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): return node def find_closest_parent_with_useful_text(self, tag, **kwargs): + """ + Find the closest parent with useful text. + + Args: + tag (Tag): The starting tag to search from. + **kwargs: Additional keyword arguments. + + Returns: + Tag: The closest parent with useful text, or None if not found. 
+ """ image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) current_tag = tag while current_tag: @@ -179,6 +249,17 @@ class WebScrapingStrategy(ContentScrapingStrategy): return None def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False): + """ + Remove unwanted attributes from an HTML element. + + Args: + element (Tag): The HTML element to remove attributes from. + important_attrs (list): List of important attributes to keep. + keep_data_attributes (bool): Whether to keep data attributes. + + Returns: + None + """ attrs_to_remove = [] for attr in element.attrs: if attr not in important_attrs: @@ -192,6 +273,26 @@ class WebScrapingStrategy(ContentScrapingStrategy): del element[attr] def process_image(self, img, url, index, total_images, **kwargs): + """ + Process an image element. + + How it works: + 1. Check if the image has valid display and inside undesired html elements. + 2. Score an image for it's usefulness. + 3. Extract image file metadata to extract size and extension. + 4. Generate a dictionary with the processed image information. + 5. Return the processed image information. + + Args: + img (Tag): The image element to process. + url (str): The URL of the page containing the image. + index (int): The index of the image in the list of images. + total_images (int): The total number of images in the list. + **kwargs: Additional keyword arguments. + + Returns: + dict: A dictionary containing the processed image information. + """ parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w') if ' ' in u else None} for u in [f"http{p}" for p in s.split("http") if p]] @@ -316,6 +417,23 @@ class WebScrapingStrategy(ContentScrapingStrategy): return image_variants if image_variants else None def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]: + """ + Process an HTML element. 
+ + How it works: + 1. Check if the element is an image, video, or audio. + 2. Extract the element's attributes and content. + 3. Process the element based on its type. + 4. Return the processed element information. + + Args: + url (str): The URL of the page containing the element. + element (Tag): The HTML element to process. + **kwargs: Additional keyword arguments. + + Returns: + dict: A dictionary containing the processed element information. + """ media = {'images': [], 'videos': [], 'audios': []} internal_links_dict = {} external_links_dict = {} @@ -334,6 +452,9 @@ class WebScrapingStrategy(ContentScrapingStrategy): } def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool: + """ + Process an HTML element. + """ try: if isinstance(element, NavigableString): if isinstance(element, Comment): @@ -534,11 +655,25 @@ class WebScrapingStrategy(ContentScrapingStrategy): return False def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: + """ + Extract content from HTML using BeautifulSoup. + + Args: + url (str): The URL of the page to scrape. + html (str): The HTML content of the page to scrape. + word_count_threshold (int): The minimum word count threshold for content extraction. + css_selector (str): The CSS selector to use for content extraction. + **kwargs: Additional keyword arguments. + + Returns: + dict: A dictionary containing the extracted content. 
+ """ success = True if not html: return None - soup = BeautifulSoup(html, 'lxml') + parser_type = kwargs.get('parser', 'lxml') + soup = BeautifulSoup(html, parser_type) body = soup.body base_domain = get_base_domain(url) diff --git a/crawl4ai/extraction_strategy.bak.py b/crawl4ai/extraction_strategy.bak.py new file mode 100644 index 00000000..2048c0ff --- /dev/null +++ b/crawl4ai/extraction_strategy.bak.py @@ -0,0 +1,1440 @@ +from abc import ABC, abstractmethod +from typing import Any, List, Dict, Optional, Union +from concurrent.futures import ThreadPoolExecutor, as_completed +import json, time +# from optimum.intel import IPEXModel +from .prompts import * +from .config import * +from .utils import * +from .models import * +from functools import partial +from .model_loader import * +import math +import numpy as np +import re +from bs4 import BeautifulSoup +from lxml import html, etree +from dataclasses import dataclass + +class ExtractionStrategy(ABC): + """ + Abstract base class for all extraction strategies. + """ + + def __init__(self, input_format: str = "markdown", **kwargs): + """ + Initialize the extraction strategy. + + Args: + input_format: Content format to use for extraction. + Options: "markdown" (default), "html", "fit_markdown" + **kwargs: Additional keyword arguments + """ + self.input_format = input_format + self.DEL = "<|DEL|>" + self.name = self.__class__.__name__ + self.verbose = kwargs.get("verbose", False) + + @abstractmethod + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. + + :param url: The URL of the webpage. + :param html: The HTML content of the webpage. + :return: A list of extracted blocks or chunks. + """ + pass + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections of text in parallel by default. + + :param url: The URL of the webpage. 
+ :param sections: List of sections (strings) to process. + :return: A list of processed JSON blocks. + """ + extracted_content = [] + with ThreadPoolExecutor() as executor: + futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections] + for future in as_completed(futures): + extracted_content.extend(future.result()) + return extracted_content + +class NoExtractionStrategy(ExtractionStrategy): + """ + A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block. + """ + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. + """ + return [{"index": 0, "content": html}] + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)] + +####################################################### +# Strategies using LLM-based extraction for text data # +####################################################### +class LLMExtractionStrategy(ExtractionStrategy): + """ + A strategy that uses an LLM to extract meaningful content from the HTML. + + Attributes: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. 
+ usages: List of individual token usages. + total_usage: Accumulated token usage. + """ + + def __init__(self, + provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, + instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): + """ + Initialize the strategy with clustering parameters. + + Args: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. 
+ + """ + super().__init__(**kwargs) + self.provider = provider + self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY") + self.instruction = instruction + self.extract_type = extraction_type + self.schema = schema + if schema: + self.extract_type = "schema" + + self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD) + self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) + self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) + self.apply_chunking = kwargs.get("apply_chunking", True) + self.base_url = kwargs.get("base_url", None) + self.api_base = kwargs.get("api_base", kwargs.get("base_url", None)) + self.extra_args = kwargs.get("extra_args", {}) + if not self.apply_chunking: + self.chunk_token_threshold = 1e9 + + self.verbose = kwargs.get("verbose", False) + self.usages = [] # Store individual usages + self.total_usage = TokenUsage() # Accumulated usage + + if not self.api_token: + raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.") + + + def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML using an LLM. + + How it works: + 1. Construct a prompt with variables. + 2. Make a request to the LLM using the prompt. + 3. Parse the response and extract blocks or chunks. + + Args: + url: The URL of the webpage. + ix: Index of the block. + html: The HTML content of the webpage. + + Returns: + A list of extracted blocks or chunks. 
+ """ + if self.verbose: + # print("[LOG] Extracting blocks from URL:", url) + print(f"[LOG] Call LLM for {url} - block index: {ix}") + + variable_values = { + "URL": url, + "HTML": escape_json_string(sanitize_html(html)), + } + + prompt_with_variables = PROMPT_EXTRACT_BLOCKS + if self.instruction: + variable_values["REQUEST"] = self.instruction + prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION + + if self.extract_type == "schema" and self.schema: + variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) + prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION + + for variable in variable_values: + prompt_with_variables = prompt_with_variables.replace( + "{" + variable + "}", variable_values[variable] + ) + + response = perform_completion_with_backoff( + self.provider, + prompt_with_variables, + self.api_token, + base_url=self.api_base or self.base_url, + extra_args = self.extra_args + ) # , json_response=self.extract_type == "schema") + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {}, + prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {} + ) + self.usages.append(usage) + + # Update totals + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + + try: + blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] + blocks = json.loads(blocks) + for block in blocks: + block['error'] = False + except Exception as e: + parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) + blocks = parsed + if unparsed: + blocks.append({ + "index": 0, + "error": 
True, + "tags": ["error"], + "content": unparsed + }) + + if self.verbose: + print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix) + return blocks + + def _merge(self, documents, chunk_token_threshold, overlap): + """ + Merge documents into sections based on chunk_token_threshold and overlap. + """ + chunks = [] + sections = [] + total_tokens = 0 + + # Calculate the total tokens across all documents + for document in documents: + total_tokens += len(document.split(' ')) * self.word_token_rate + + # Calculate the number of sections needed + num_sections = math.floor(total_tokens / chunk_token_threshold) + if num_sections < 1: + num_sections = 1 # Ensure there is at least one section + adjusted_chunk_threshold = total_tokens / num_sections + + total_token_so_far = 0 + current_chunk = [] + + for document in documents: + tokens = document.split(' ') + token_count = len(tokens) * self.word_token_rate + + if total_token_so_far + token_count <= adjusted_chunk_threshold: + current_chunk.extend(tokens) + total_token_so_far += token_count + else: + # Ensure to handle the last section properly + if len(sections) == num_sections - 1: + current_chunk.extend(tokens) + continue + + # Add overlap if specified + if overlap > 0 and current_chunk: + overlap_tokens = current_chunk[-overlap:] + current_chunk.extend(overlap_tokens) + + sections.append(' '.join(current_chunk)) + current_chunk = tokens + total_token_so_far = token_count + + # Add the last chunk + if current_chunk: + sections.append(' '.join(current_chunk)) + + return sections + + + def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: + """ + Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. + + Args: + url: The URL of the webpage. + sections: List of sections (strings) to process. + + Returns: + A list of extracted blocks or chunks. 
+ """ + + merged_sections = self._merge( + sections, self.chunk_token_threshold, + overlap= int(self.chunk_token_threshold * self.overlap_rate) + ) + extracted_content = [] + if self.provider.startswith("groq/"): + # Sequential processing with a delay + for ix, section in enumerate(merged_sections): + extract_func = partial(self.extract, url) + extracted_content.extend(extract_func(ix, sanitize_input_encode(section))) + time.sleep(0.5) # 500 ms delay between each processing + else: + # Parallel processing using ThreadPoolExecutor + # extract_func = partial(self.extract, url) + # for ix, section in enumerate(merged_sections): + # extracted_content.append(extract_func(ix, section)) + + with ThreadPoolExecutor(max_workers=4) as executor: + extract_func = partial(self.extract, url) + futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)] + + for future in as_completed(futures): + try: + extracted_content.extend(future.result()) + except Exception as e: + if self.verbose: + print(f"Error in thread execution: {e}") + # Add error information to extracted_content + extracted_content.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": str(e) + }) + + + return extracted_content + + + def show_usage(self) -> None: + """Print a detailed token usage report showing total and per-request usage.""" + print("\n=== Token Usage Summary ===") + print(f"{'Type':<15} {'Count':>12}") + print("-" * 30) + print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}") + print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}") + print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}") + + print("\n=== Usage History ===") + print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}") + print("-" * 48) + for i, usage in enumerate(self.usages, 1): + print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}") + 
+####################################################### +# Strategies using clustering for text data extraction # +####################################################### + +class CosineStrategy(ExtractionStrategy): + """ + Extract meaningful blocks or chunks from the given HTML using cosine similarity. + + How it works: + 1. Pre-filter documents using embeddings and semantic_filter. + 2. Perform clustering using cosine similarity. + 3. Organize texts by their cluster labels, retaining order. + 4. Filter clusters by word count. + 5. Extract meaningful blocks or chunks from the filtered clusters. + + Attributes: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. + model_name (str): The name of the sentence-transformers model. + sim_threshold (float): The similarity threshold for clustering. + """ + def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs): + """ + Initialize the strategy with clustering parameters. + + Args: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. 
+ """ + super().__init__(**kwargs) + + import numpy as np + + self.semantic_filter = semantic_filter + self.word_count_threshold = word_count_threshold + self.max_dist = max_dist + self.linkage_method = linkage_method + self.top_k = top_k + self.sim_threshold = sim_threshold + self.timer = time.time() + self.verbose = kwargs.get("verbose", False) + + self.buffer_embeddings = np.array([]) + self.get_embedding_method = "direct" + + self.device = get_device() + # import torch + # self.device = torch.device('cpu') + + self.default_batch_size = calculate_batch_size(self.device) + + if self.verbose: + print(f"[LOG] Loading Extraction Model for {self.device.type} device.") + + # if False and self.device.type == "cpu": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + # else: + + self.tokenizer, self.model = load_HF_embedding_model(model_name) + self.model.to(self.device) + self.model.eval() + + self.get_embedding_method = "batch" + + self.buffer_embeddings = np.array([]) + + # if model_name == "bert-base-uncased": + # self.tokenizer, self.model = load_bert_base_uncased() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "BAAI/bge-small-en-v1.5": + # self.tokenizer, self.model = load_bge_small_en_v1_5() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "sentence-transformers/all-MiniLM-L6-v2": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + + + if self.verbose: + print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.") + + self.nlp, _ = load_text_multilabel_classifier() + # self.default_batch_size = 16 if self.device.type == 'cpu' else 64 + + if self.verbose: + print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - 
self.timer) + " seconds") + + def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]: + """ + Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding. + + Args: + documents (List[str]): A list of document texts. + semantic_filter (str): A keyword filter for document filtering. + at_least_k (int): The minimum number of documents to return. + + Returns: + List[str]: A list of filtered and sorted document texts. + """ + + if not semantic_filter: + return documents + + if len(documents) < at_least_k: + at_least_k = len(documents) // 2 + + from sklearn.metrics.pairwise import cosine_similarity + + # Compute embedding for the keyword filter + query_embedding = self.get_embeddings([semantic_filter])[0] + + # Compute embeddings for the documents + document_embeddings = self.get_embeddings(documents) + + # Calculate cosine similarity between the query embedding and document embeddings + similarities = cosine_similarity([query_embedding], document_embeddings).flatten() + + # Filter documents based on the similarity threshold + filtered_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim >= self.sim_threshold] + + # If the number of filtered documents is less than at_least_k, sort remaining documents by similarity + if len(filtered_docs) < at_least_k: + remaining_docs = [(doc, sim) for doc, sim in zip(documents, similarities) if sim < self.sim_threshold] + remaining_docs.sort(key=lambda x: x[1], reverse=True) + filtered_docs.extend(remaining_docs[:at_least_k - len(filtered_docs)]) + + # Extract the document texts from the tuples + filtered_docs = [doc for doc, _ in filtered_docs] + + return filtered_docs[:at_least_k] + + def get_embeddings(self, sentences: List[str], batch_size=None, bypass_buffer=False): + """ + Get BERT embeddings for a list of sentences. + + Args: + sentences (List[str]): A list of text chunks (sentences). 
+ + Returns: + NumPy array of embeddings. + """ + # if self.buffer_embeddings.any() and not bypass_buffer: + # return self.buffer_embeddings + + if self.device.type in [ "cpu", "gpu", "cuda", "mps"]: + import torch + # Tokenize sentences and convert to tensor + if batch_size is None: + batch_size = self.default_batch_size + + all_embeddings = [] + for i in range(0, len(sentences), batch_size): + batch_sentences = sentences[i:i + batch_size] + encoded_input = self.tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt') + encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()} + + # Ensure no gradients are calculated + with torch.no_grad(): + model_output = self.model(**encoded_input) + + # Get embeddings from the last hidden state (mean pooling) + embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy() + all_embeddings.append(embeddings) + + self.buffer_embeddings = np.vstack(all_embeddings) + elif self.device.type == "cpu": + # self.buffer_embeddings = self.model(sentences) + if batch_size is None: + batch_size = self.default_batch_size + + all_embeddings = [] + for i in range(0, len(sentences), batch_size): + batch_sentences = sentences[i:i + batch_size] + embeddings = self.model(batch_sentences) + all_embeddings.append(embeddings) + + self.buffer_embeddings = np.vstack(all_embeddings) + return self.buffer_embeddings + + def hierarchical_clustering(self, sentences: List[str], embeddings = None): + """ + Perform hierarchical clustering on sentences and return cluster labels. + + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of cluster labels. 
+ """ + # Get embeddings + from scipy.cluster.hierarchy import linkage, fcluster + from scipy.spatial.distance import pdist + self.timer = time.time() + embeddings = self.get_embeddings(sentences, bypass_buffer=True) + # print(f"[LOG] 🚀 Embeddings computed in {time.time() - self.timer:.2f} seconds") + # Compute pairwise cosine distances + distance_matrix = pdist(embeddings, 'cosine') + # Perform agglomerative clustering respecting order + linked = linkage(distance_matrix, method=self.linkage_method) + # Form flat clusters + labels = fcluster(linked, self.max_dist, criterion='distance') + return labels + + def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]: + """ + Filter clusters to remove those with a word count below the threshold. + + Args: + clusters (Dict[int, List[str]]): Dictionary of clusters. + + Returns: + Dict[int, List[str]]: Filtered dictionary of clusters. + """ + filtered_clusters = {} + for cluster_id, texts in clusters.items(): + # Concatenate texts for analysis + full_text = " ".join(texts) + # Count words + word_count = len(full_text.split()) + + # Keep clusters with word count above the threshold + if word_count >= self.word_count_threshold: + filtered_clusters[cluster_id] = texts + + return filtered_clusters + + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract clusters from HTML content using hierarchical clustering. + + Args: + url (str): The URL of the webpage. + html (str): The HTML content of the webpage. + + Returns: + List[Dict[str, Any]]: A list of processed JSON blocks. 
+ """ + # Assume `html` is a list of text chunks for this strategy + t = time.time() + text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed + + # Pre-filter documents using embeddings and semantic_filter + text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter) + + if not text_chunks: + return [] + + # Perform clustering + labels = self.hierarchical_clustering(text_chunks) + # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds") + + # Organize texts by their cluster labels, retaining order + t = time.time() + clusters = {} + for index, label in enumerate(labels): + clusters.setdefault(label, []).append(text_chunks[index]) + + # Filter clusters by word count + filtered_clusters = self.filter_clusters_by_word_count(clusters) + + # Convert filtered clusters to a sorted list of dictionaries + cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)] + + if self.verbose: + print(f"[LOG] 🚀 Assign tags using {self.device}") + + if self.device.type in ["gpu", "cuda", "mps", "cpu"]: + labels = self.nlp([cluster['content'] for cluster in cluster_list]) + + for cluster, label in zip(cluster_list, labels): + cluster['tags'] = label + # elif self.device.type == "cpu": + # # Process the text with the loaded model + # texts = [cluster['content'] for cluster in cluster_list] + # # Batch process texts + # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) + + # for doc, cluster in zip(docs, cluster_list): + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + # for cluster in cluster_list: + # doc = self.nlp(cluster['content']) + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + if 
self.verbose: + print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds") + + return cluster_list + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections using hierarchical clustering. + + Args: + url (str): The URL of the webpage. + sections (List[str]): List of sections (strings) to process. + + Returns: + """ + # This strategy processes all sections together + + return self.extract(url, self.DEL.join(sections), **kwargs) + +####################################################### +# New extraction strategies for JSON-based extraction # +####################################################### + +class JsonElementExtractionStrategy(ExtractionStrategy): + """ + Abstract base class for extracting structured JSON from HTML content. + + How it works: + 1. Parses HTML content using the `_parse_html` method. + 2. Uses a schema to define base selectors, fields, and transformations. + 3. Extracts data hierarchically, supporting nested fields and lists. + 4. Handles computed fields with expressions or functions. + + Attributes: + DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'. + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content. + _extract_item(element, fields): Extracts fields from a single element. + _extract_single_field(element, field): Extracts a single field based on its type. + _apply_transform(value, transform): Applies a transformation to a value. + _compute_field(item, field): Computes a field value using an expression or function. + run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy. + + Abstract Methods: + _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml). 
class JsonElementExtractionStrategy(ExtractionStrategy):
    """
    Abstract base for schema-driven structured (JSON) extraction from HTML.

    Subclasses supply the parsing/selection primitives (``_parse_html``,
    ``_get_base_elements``, ``_get_elements``, ``_get_element_text``,
    ``_get_element_html``, ``_get_element_attribute``); this class drives the
    schema walk: base elements -> fields -> nested/list fields -> transforms
    and computed fields.
    """

    # Delimiter used by `run` to stitch HTML sections back together.
    DEL = '\n'

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """
        Args:
            schema (Dict[str, Any]): Extraction rules (baseSelector, fields, ...).
            **kwargs: Forwarded to ``ExtractionStrategy``; ``verbose`` enables
                debug printing of per-field errors.
        """
        super().__init__(**kwargs)
        self.schema = schema
        self.verbose = kwargs.get('verbose', False)

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Parse ``html_content`` and extract one item per base element.

        Args:
            url (str): URL of the page being processed (not used for selection).
            html_content (str): Raw HTML to parse.

        Returns:
            List[Dict[str, Any]]: Extracted items; items with no fields are dropped.
        """
        parsed = self._parse_html(html_content)
        results = []
        for base in self._get_base_elements(parsed, self.schema['baseSelector']):
            item = {}
            # Optional attributes taken from the base element itself.
            for base_field in self.schema.get('baseFields', []):
                value = self._extract_single_field(base, base_field)
                if value is not None:
                    item[base_field['name']] = value
            item.update(self._extract_item(base, self.schema['fields']))
            if item:
                results.append(item)
        return results

    @abstractmethod
    def _parse_html(self, html_content: str):
        """Parse raw HTML into the subclass's document representation."""
        pass

    @abstractmethod
    def _get_base_elements(self, parsed_html, selector: str):
        """Return all base elements matching ``selector``."""
        pass

    @abstractmethod
    def _get_elements(self, element, selector: str):
        """Return child elements of ``element`` matching ``selector``."""
        pass

    def _extract_field(self, element, field):
        """Extract one field, dispatching on its ``type``; errors fall back to ``default``."""
        try:
            ftype = field['type']
            if ftype == 'nested':
                matches = self._get_elements(element, field['selector'])
                if not matches:
                    return {}
                return self._extract_item(matches[0], field['fields'])
            if ftype == 'list':
                return [
                    self._extract_list_item(el, field['fields'])
                    for el in self._get_elements(element, field['selector'])
                ]
            if ftype == 'nested_list':
                return [
                    self._extract_item(el, field['fields'])
                    for el in self._get_elements(element, field['selector'])
                ]
            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_single_field(self, element, field):
        """
        Extract a scalar field (text / attribute / html / regex), apply any
        declared transform, and fall back to ``default`` when nothing matched.

        Args:
            element: The element to extract from.
            field (Dict[str, Any]): The field definition from the schema.

        Returns:
            Any: The (possibly transformed) field value, or the default.
        """
        if 'selector' in field:
            matches = self._get_elements(element, field['selector'])
            if not matches:
                return field.get('default')
            target = matches[0]
        else:
            target = element

        ftype = field['type']
        value = None
        if ftype == 'text':
            value = self._get_element_text(target)
        elif ftype == 'attribute':
            value = self._get_element_attribute(target, field['attribute'])
        elif ftype == 'html':
            value = self._get_element_html(target)
        elif ftype == 'regex':
            m = re.search(field['pattern'], self._get_element_text(target))
            value = m.group(1) if m else None

        if 'transform' in field:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_list_item(self, element, fields):
        """Scalar-only extraction for one entry of a ``list`` field."""
        out = {}
        for f in fields:
            v = self._extract_single_field(element, f)
            if v is not None:
                out[f['name']] = v
        return out

    def _extract_item(self, element, fields):
        """Extract all ``fields``; computed fields see previously extracted values."""
        item = {}
        for f in fields:
            value = (
                self._compute_field(item, f)
                if f['type'] == 'computed'
                else self._extract_field(element, f)
            )
            if value is not None:
                item[f['name']] = value
        return item

    def _apply_transform(self, value, transform):
        """Apply a named string transform; unknown names return ``value`` unchanged."""
        if transform == 'lowercase':
            return value.lower()
        if transform == 'uppercase':
            return value.upper()
        if transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        """
        Derive a value from already-extracted fields via an ``expression``
        (evaluated with ``eval``) or a callable ``function``.

        NOTE(security): ``eval`` executes arbitrary code — schemas must come
        from trusted sources only.
        """
        try:
            if 'expression' in field:
                return eval(field['expression'], {}, item)
            if 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
            return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Join ``sections`` with ``DEL`` and run ``extract`` once over the result.

        Args:
            url (str): URL of the page being processed.
            sections (List[str]): HTML sections to combine.

        Returns:
            List[Dict[str, Any]]: Extracted items.
        """
        return self.extract(url, self.DEL.join(sections), **kwargs)

    @abstractmethod
    def _get_element_text(self, element) -> str:
        """Return the text content of ``element``."""
        pass

    @abstractmethod
    def _get_element_html(self, element) -> str:
        """Return the raw HTML of ``element``."""
        pass

    @abstractmethod
    def _get_element_attribute(self, element, attribute: str):
        """Return the value of ``attribute`` on ``element`` (or None)."""
        pass
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return BeautifulSoup(html_content, 'html.parser') + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.select(selector) + + def _get_elements(self, element, selector: str): + selected = element.select_one(selector) + return [selected] if selected else [] + + def _get_element_text(self, element) -> str: + return element.get_text(strip=True) + + def _get_element_html(self, element) -> str: + return str(element) + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + +class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors. + + How it works: + 1. Parses HTML content into an lxml tree. + 2. Selects elements using XPath expressions. + 3. Converts CSS selectors to XPath when needed. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into an lxml tree. + _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector. + _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression. + _get_elements(element, selector): Selects child elements using an XPath selector. + _get_element_text(element): Extracts text content from an lxml element. + _get_element_html(element): Extracts the raw HTML content of an lxml element. + _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element. 
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return html.fromstring(html_content) + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.xpath(selector) + + def _css_to_xpath(self, css_selector: str) -> str: + """Convert CSS selector to XPath if needed""" + if '/' in css_selector: # Already an XPath + return css_selector + return self._basic_css_to_xpath(css_selector) + + def _basic_css_to_xpath(self, css_selector: str) -> str: + """Basic CSS to XPath conversion for common cases""" + if ' > ' in css_selector: + parts = css_selector.split(' > ') + return '//' + '/'.join(parts) + if ' ' in css_selector: + parts = css_selector.split(' ') + return '//' + '//'.join(parts) + return '//' + css_selector + + def _get_elements(self, element, selector: str): + xpath = self._css_to_xpath(selector) + if not xpath.startswith('.'): + xpath = '.' + xpath + return element.xpath(xpath) + + def _get_element_text(self, element) -> str: + return ''.join(element.xpath('.//text()')).strip() + + def _get_element_html(self, element) -> str: + return etree.tostring(element, encoding='unicode') + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + + +####################################################### +# Strategies based on the extraction of specific types# +####################################################### + +class TopicExtractionStrategy(ExtractionStrategy): + def __init__(self, num_keywords: int = 3, **kwargs): + """ + Initialize the topic extraction strategy with parameters for topic segmentation. + + :param num_keywords: Number of keywords to represent each topic segment. 
+ """ + import nltk + super().__init__(**kwargs) + self.num_keywords = num_keywords + self.tokenizer = nltk.TextTilingTokenizer() + + def extract_keywords(self, text: str) -> List[str]: + """ + Extract keywords from a given text segment using simple frequency analysis. + + :param text: The text segment from which to extract keywords. + :return: A list of keyword strings. + """ + import nltk + # Tokenize the text and compute word frequency + words = nltk.word_tokenize(text) + freq_dist = nltk.FreqDist(words) + # Get the most common words as keywords + keywords = [word for (word, _) in freq_dist.most_common(self.num_keywords)] + return keywords + + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract topics from HTML content using TextTiling for segmentation and keyword extraction. + + :param url: The URL of the webpage. + :param html: The HTML content of the webpage. + :param provider: The provider to be used for extraction (not used here). + :param api_token: Optional API token for the provider (not used here). + :return: A list of dictionaries representing the topics. + """ + # Use TextTiling to segment the text into topics + segmented_topics = html.split(self.DEL) # Split by lines or paragraphs as needed + + # Prepare the output as a list of dictionaries + topic_list = [] + for i, segment in enumerate(segmented_topics): + # Extract keywords for each segment + keywords = self.extract_keywords(segment) + topic_list.append({ + "index": i, + "content": segment, + "keywords": keywords + }) + + return topic_list + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections using topic segmentation and keyword extraction. + + :param url: The URL of the webpage. + :param sections: List of sections (strings) to process. + :param provider: The provider to be used for extraction (not used here). + :param api_token: Optional API token for the provider (not used here). 
class ContentSummarizationStrategy(ExtractionStrategy):
    """
    Summarize text sections with a transformers summarization pipeline.
    """

    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs):
        """
        Args:
            model_name (str): HuggingFace model id used for summarization.
        """
        super().__init__(**kwargs)
        from transformers import pipeline  # Local import keeps transformers optional.
        self.summarizer = pipeline("summarization", model=model_name)

    def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> Dict[str, Any]:
        """
        Summarize a single section of text.

        FIX: return annotation corrected from ``List[Dict[str, Any]]`` — both
        return statements below yield a single dict.

        Args:
            url (str): URL of the webpage (unused here).
            text (str): Section of text to summarize.
            provider: Unused; kept for interface compatibility.
            api_token: Unused; kept for interface compatibility.

        Returns:
            Dict[str, Any]: ``{"summary": <text>}``; falls back to the original
            text when summarization fails.
        """
        try:
            summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
            return {"summary": summary[0]['summary_text']}
        except Exception as e:
            print(f"Error summarizing text: {e}")
            return {"summary": text}  # Best-effort fallback keeps the pipeline running.

    def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Summarize all sections in parallel, preserving input order.

        Args:
            url (str): URL of the webpage.
            sections (List[str]): Sections to summarize.
            provider: Unused; kept for interface compatibility.
            api_token: Unused; kept for interface compatibility.

        Returns:
            List[Dict[str, Any]]: One summary dict per section, in input order.
        """
        summaries = []
        with ThreadPoolExecutor() as executor:
            future_to_index = {
                executor.submit(self.extract, url, section, provider, api_token): i
                for i, section in enumerate(sections)
            }
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                try:
                    summaries.append((index, future.result()))
                except Exception as e:
                    print(f"Error processing section {index}: {e}")
                    # Fall back to the raw section text on failure.
                    summaries.append((index, {"summary": sections[index]}))
        # Restore original section order (futures complete out of order).
        summaries.sort(key=lambda pair: pair[0])
        return [summary for _, summary in summaries]
+ """ + # Use a ThreadPoolExecutor to summarize in parallel + summaries = [] + with ThreadPoolExecutor() as executor: + # Create a future for each section's summarization + future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)} + for future in as_completed(future_to_section): + section_index = future_to_section[future] + try: + summary_result = future.result() + summaries.append((section_index, summary_result)) + except Exception as e: + print(f"Error processing section {section_index}: {e}") + summaries.append((section_index, {"summary": sections[section_index]})) # Fallback to original text + + # Sort summaries by the original section index to maintain order + summaries.sort(key=lambda x: x[0]) + return [summary for _, summary in summaries] + +####################################################### +# Deprecated strategies +####################################################### + +class _JsonCssExtractionStrategy(ExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(**kwargs) + self.schema = schema + + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + soup = BeautifulSoup(html, 'html.parser') + base_elements = soup.select(self.schema['baseSelector']) + + results = [] + for element in base_elements: + # Extract base element attributes first + item = {} + if 'baseFields' in self.schema: + for field in self.schema['baseFields']: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + + # Then extract child fields + field_data = self._extract_item(element, self.schema['fields']) + item.update(field_data) + + results.append(item) + + return results + + def _extract_field(self, element, field): + try: + if field['type'] == 'nested': + nested_element = element.select_one(field['selector']) + return 
self._extract_item(nested_element, field['fields']) if nested_element else {} + + if field['type'] == 'list': + elements = element.select(field['selector']) + return [self._extract_list_item(el, field['fields']) for el in elements] + + if field['type'] == 'nested_list': + elements = element.select(field['selector']) + return [self._extract_item(el, field['fields']) for el in elements] + + return self._extract_single_field(element, field) + except Exception as e: + if self.verbose: + print(f"Error extracting field {field['name']}: {str(e)}") + return field.get('default') + + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _extract_single_field(self, element, field): + if 'selector' in field: + selected = element.select_one(field['selector']) + if not selected: + return field.get('default') + else: + selected = element + + value = None + if field['type'] == 'text': + value = selected.get_text(strip=True) + elif field['type'] == 'attribute': + value = selected.get(field['attribute']) + elif field['type'] == 'html': + value = str(selected) + elif field['type'] == 'regex': + text = selected.get_text(strip=True) + match = re.search(field['pattern'], text) + value = match.group(1) if match else None + + if 'transform' in field: + value = self._apply_transform(value, field['transform']) + + return value if value is not None else field.get('default') + + def _extract_item(self, element, fields): + item = {} + for field in fields: + if field['type'] == 'computed': + value = self._compute_field(item, field) + else: + value = self._extract_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _apply_transform(self, value, transform): + if transform == 'lowercase': + return value.lower() + elif transform == 'uppercase': + return value.upper() + elif transform == 'strip': + 
class _JsonXPathExtractionStrategy(ExtractionStrategy):
    """
    Deprecated XPath/lxml JSON extractor; superseded by
    ``JsonXPathExtractionStrategy``.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # This strategy only makes sense on raw HTML.
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """Extract one item per element matching the schema's base XPath."""
        tree = html.fromstring(html_content)
        results = []
        for base in tree.xpath(self.schema['baseSelector']):
            item = {}
            # Attributes of the base element itself, if declared.
            for base_field in self.schema.get('baseFields', []):
                value = self._extract_single_field(base, base_field)
                if value is not None:
                    item[base_field['name']] = value
            item.update(self._extract_item(base, self.schema['fields']))
            results.append(item)  # NOTE: empty items are kept (legacy behavior).
        return results

    def _css_to_xpath(self, css_selector: str) -> str:
        """Pass through strings that already look like XPath; convert the rest."""
        return css_selector if '/' in css_selector else self._basic_css_to_xpath(css_selector)

    def _basic_css_to_xpath(self, css_selector: str) -> str:
        """Minimal CSS->XPath: child (' > ') and descendant (' ') combinators only."""
        if ' > ' in css_selector:
            return '//' + '/'.join(css_selector.split(' > '))
        if ' ' in css_selector:
            return '//' + '//'.join(css_selector.split(' '))
        return '//' + css_selector

    def _extract_field(self, element, field):
        """
        Dispatch on field type; errors fall back to the field's default.

        NOTE(review): converted selectors start with '//' and are NOT anchored
        with '.', so they search from the document root rather than relative to
        ``element`` — legacy behavior preserved (the replacement class anchors).
        """
        try:
            ftype = field['type']
            if ftype == 'nested':
                matches = element.xpath(self._css_to_xpath(field['selector']))
                child = matches[0] if matches else None
                return self._extract_item(child, field['fields']) if child is not None else {}
            if ftype == 'list':
                xpath = self._css_to_xpath(field['selector'])
                return [self._extract_list_item(el, field['fields']) for el in element.xpath(xpath)]
            if ftype == 'nested_list':
                xpath = self._css_to_xpath(field['selector'])
                return [self._extract_item(el, field['fields']) for el in element.xpath(xpath)]
            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_list_item(self, element, fields):
        """Scalar-only extraction for one entry of a ``list`` field."""
        out = {}
        for f in fields:
            v = self._extract_single_field(element, f)
            if v is not None:
                out[f['name']] = v
        return out

    def _extract_single_field(self, element, field):
        """Extract a scalar (text/attribute/html/regex) value with optional transform."""
        if 'selector' in field:
            matches = element.xpath(self._css_to_xpath(field['selector']))
            if not matches:
                return field.get('default')
            target = matches[0]
        else:
            target = element

        ftype = field['type']
        value = None
        if ftype == 'text':
            value = ''.join(target.xpath('.//text()')).strip()
        elif ftype == 'attribute':
            value = target.get(field['attribute'])
        elif ftype == 'html':
            value = etree.tostring(target, encoding='unicode')
        elif ftype == 'regex':
            text = ''.join(target.xpath('.//text()')).strip()
            m = re.search(field['pattern'], text)
            value = m.group(1) if m else None

        if 'transform' in field:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_item(self, element, fields):
        """Extract all fields; computed fields see previously extracted values."""
        item = {}
        for f in fields:
            value = (
                self._compute_field(item, f)
                if f['type'] == 'computed'
                else self._extract_field(element, f)
            )
            if value is not None:
                item[f['name']] = value
        return item

    def _apply_transform(self, value, transform):
        """Apply a named string transform; unknown names are a no-op."""
        if transform == 'lowercase':
            return value.lower()
        if transform == 'uppercase':
            return value.upper()
        if transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        """
        Compute a derived value via ``expression`` (eval) or ``function``.
        NOTE(security): ``eval`` executes arbitrary code — only use trusted schemas.
        """
        try:
            if 'expression' in field:
                return eval(field['expression'], {}, item)
            if 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
            return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Join sections with DEL and extract from the combined HTML."""
        return self.extract(url, self.DEL.join(sections), **kwargs)
+ """ return [{"index": 0, "content": html}] def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)] - ####################################################### # Strategies using LLM-based extraction for text data # ####################################################### - - - class LLMExtractionStrategy(ExtractionStrategy): + """ + A strategy that uses an LLM to extract meaningful content from the HTML. + + Attributes: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. + """ + def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): """ Initialize the strategy with clustering parameters. + + Args: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. 
+ word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. - :param provider: The provider to use for extraction. - :param api_token: The API token for the provider. - :param instruction: The instruction to use for the LLM model. """ super().__init__(**kwargs) self.provider = provider @@ -114,6 +151,22 @@ class LLMExtractionStrategy(ExtractionStrategy): def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML using an LLM. + + How it works: + 1. Construct a prompt with variables. + 2. Make a request to the LLM using the prompt. + 3. Parse the response and extract blocks or chunks. + + Args: + url: The URL of the webpage. + ix: Index of the block. + html: The HTML content of the webpage. + + Returns: + A list of extracted blocks or chunks. + """ if self.verbose: # print("[LOG] Extracting blocks from URL:", url) print(f"[LOG] Call LLM for {url} - block index: {ix}") @@ -180,6 +233,9 @@ class LLMExtractionStrategy(ExtractionStrategy): return blocks def _merge(self, documents, chunk_token_threshold, overlap): + """ + Merge documents into sections based on chunk_token_threshold and overlap. + """ chunks = [] sections = [] total_tokens = 0 @@ -229,6 +285,13 @@ class LLMExtractionStrategy(ExtractionStrategy): def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: """ Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. + + Args: + url: The URL of the webpage. + sections: List of sections (strings) to process. + + Returns: + A list of extracted blocks or chunks. 
""" merged_sections = self._merge( @@ -285,12 +348,30 @@ class LLMExtractionStrategy(ExtractionStrategy): for i, usage in enumerate(self.usages, 1): print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}") - ####################################################### # Strategies using clustering for text data extraction # ####################################################### class CosineStrategy(ExtractionStrategy): + """ + Extract meaningful blocks or chunks from the given HTML using cosine similarity. + + How it works: + 1. Pre-filter documents using embeddings and semantic_filter. + 2. Perform clustering using cosine similarity. + 3. Organize texts by their cluster labels, retaining order. + 4. Filter clusters by word count. + 5. Extract meaningful blocks or chunks from the filtered clusters. + + Attributes: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. + model_name (str): The name of the sentence-transformers model. + sim_threshold (float): The similarity threshold for clustering. + """ def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs): """ Initialize the strategy with clustering parameters. @@ -368,11 +449,13 @@ class CosineStrategy(ExtractionStrategy): """ Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding. - :param documents: List of text chunks (documents). - :param semantic_filter: A string containing the keywords for filtering. - :param threshold: Cosine similarity threshold for filtering documents. 
- :param at_least_k: Minimum number of documents to return. - :return: List of filtered documents, ensuring at least `at_least_k` documents. + Args: + documents (List[str]): A list of document texts. + semantic_filter (str): A keyword filter for document filtering. + at_least_k (int): The minimum number of documents to return. + + Returns: + List[str]: A list of filtered and sorted document texts. """ if not semantic_filter: @@ -410,8 +493,11 @@ class CosineStrategy(ExtractionStrategy): """ Get BERT embeddings for a list of sentences. - :param sentences: List of text chunks (sentences). - :return: NumPy array of embeddings. + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of embeddings. """ # if self.buffer_embeddings.any() and not bypass_buffer: # return self.buffer_embeddings @@ -455,8 +541,11 @@ class CosineStrategy(ExtractionStrategy): """ Perform hierarchical clustering on sentences and return cluster labels. - :param sentences: List of text chunks (sentences). - :return: NumPy array of cluster labels. + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of cluster labels. """ # Get embeddings from scipy.cluster.hierarchy import linkage, fcluster @@ -472,12 +561,15 @@ class CosineStrategy(ExtractionStrategy): labels = fcluster(linked, self.max_dist, criterion='distance') return labels - def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]): + def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]: """ Filter clusters to remove those with a word count below the threshold. - :param clusters: Dictionary of clusters. - :return: Filtered dictionary of clusters. + Args: + clusters (Dict[int, List[str]]): Dictionary of clusters. + + Returns: + Dict[int, List[str]]: Filtered dictionary of clusters. 
""" filtered_clusters = {} for cluster_id, texts in clusters.items(): @@ -496,9 +588,12 @@ class CosineStrategy(ExtractionStrategy): """ Extract clusters from HTML content using hierarchical clustering. - :param url: The URL of the webpage. - :param html: The HTML content of the webpage. - :return: A list of dictionaries representing the clusters. + Args: + url (str): The URL of the webpage. + html (str): The HTML content of the webpage. + + Returns: + List[Dict[str, Any]]: A list of processed JSON blocks. """ # Assume `html` is a list of text chunks for this strategy t = time.time() @@ -560,159 +655,85 @@ class CosineStrategy(ExtractionStrategy): """ Process sections using hierarchical clustering. - :param url: The URL of the webpage. - :param sections: List of sections (strings) to process. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A list of processed JSON blocks. + Args: + url (str): The URL of the webpage. + sections (List[str]): List of sections (strings) to process. + + Returns: """ # This strategy processes all sections together return self.extract(url, self.DEL.join(sections), **kwargs) - -####################################################### -# Strategies based on the extraction of specific types # -####################################################### - -class TopicExtractionStrategy(ExtractionStrategy): - def __init__(self, num_keywords: int = 3, **kwargs): - """ - Initialize the topic extraction strategy with parameters for topic segmentation. - - :param num_keywords: Number of keywords to represent each topic segment. - """ - import nltk - super().__init__(**kwargs) - self.num_keywords = num_keywords - self.tokenizer = nltk.TextTilingTokenizer() - - def extract_keywords(self, text: str) -> List[str]: - """ - Extract keywords from a given text segment using simple frequency analysis. 
- - :param text: The text segment from which to extract keywords. - :return: A list of keyword strings. - """ - import nltk - # Tokenize the text and compute word frequency - words = nltk.word_tokenize(text) - freq_dist = nltk.FreqDist(words) - # Get the most common words as keywords - keywords = [word for (word, _) in freq_dist.most_common(self.num_keywords)] - return keywords - - def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: - """ - Extract topics from HTML content using TextTiling for segmentation and keyword extraction. - - :param url: The URL of the webpage. - :param html: The HTML content of the webpage. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A list of dictionaries representing the topics. - """ - # Use TextTiling to segment the text into topics - segmented_topics = html.split(self.DEL) # Split by lines or paragraphs as needed - - # Prepare the output as a list of dictionaries - topic_list = [] - for i, segment in enumerate(segmented_topics): - # Extract keywords for each segment - keywords = self.extract_keywords(segment) - topic_list.append({ - "index": i, - "content": segment, - "keywords": keywords - }) - - return topic_list - - def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: - """ - Process sections using topic segmentation and keyword extraction. - - :param url: The URL of the webpage. - :param sections: List of sections (strings) to process. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A list of processed JSON blocks. 
- """ - # Concatenate sections into a single text for coherent topic segmentation - - - return self.extract(url, self.DEL.join(sections), **kwargs) - -class ContentSummarizationStrategy(ExtractionStrategy): - def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs): - """ - Initialize the content summarization strategy with a specific model. - - :param model_name: The model to use for summarization. - """ - super().__init__(**kwargs) - from transformers import pipeline - self.summarizer = pipeline("summarization", model=model_name) - - def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]: - """ - Summarize a single section of text. - - :param url: The URL of the webpage. - :param text: A section of text to summarize. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A dictionary with the summary. - """ - try: - summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False) - return {"summary": summary[0]['summary_text']} - except Exception as e: - print(f"Error summarizing text: {e}") - return {"summary": text} # Fallback to original text if summarization fails - - def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]: - """ - Process each section in parallel to produce summaries. - - :param url: The URL of the webpage. - :param sections: List of sections (strings) to summarize. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A list of dictionaries with summaries for each section. 
- """ - # Use a ThreadPoolExecutor to summarize in parallel - summaries = [] - with ThreadPoolExecutor() as executor: - # Create a future for each section's summarization - future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)} - for future in as_completed(future_to_section): - section_index = future_to_section[future] - try: - summary_result = future.result() - summaries.append((section_index, summary_result)) - except Exception as e: - print(f"Error processing section {section_index}: {e}") - summaries.append((section_index, {"summary": sections[section_index]})) # Fallback to original text - - # Sort summaries by the original section index to maintain order - summaries.sort(key=lambda x: x[0]) - return [summary for _, summary in summaries] - - ####################################################### # New extraction strategies for JSON-based extraction # ####################################################### class JsonElementExtractionStrategy(ExtractionStrategy): + """ + Abstract base class for extracting structured JSON from HTML content. + + How it works: + 1. Parses HTML content using the `_parse_html` method. + 2. Uses a schema to define base selectors, fields, and transformations. + 3. Extracts data hierarchically, supporting nested fields and lists. + 4. Handles computed fields with expressions or functions. + + Attributes: + DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'. + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content. + _extract_item(element, fields): Extracts fields from a single element. + _extract_single_field(element, field): Extracts a single field based on its type. + _apply_transform(value, transform): Applies a transformation to a value. 
+ _compute_field(item, field): Computes a field value using an expression or function. + run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy. + + Abstract Methods: + _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml). + _get_base_elements(parsed_html, selector): Retrieves base elements using a selector. + _get_elements(element, selector): Retrieves child elements using a selector. + _get_element_text(element): Extracts text content from an element. + _get_element_html(element): Extracts raw HTML from an element. + _get_element_attribute(element, attribute): Extracts an attribute's value from an element. + """ + + DEL = '\n' def __init__(self, schema: Dict[str, Any], **kwargs): + """ + Initialize the JSON element extraction strategy with a schema. + + Args: + schema (Dict[str, Any]): The schema defining the extraction rules. + """ super().__init__(**kwargs) self.schema = schema self.verbose = kwargs.get('verbose', False) def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract structured data from HTML content. + + How it works: + 1. Parses the HTML content using the `_parse_html` method. + 2. Identifies base elements using the schema's base selector. + 3. Extracts fields from each base element using `_extract_item`. + + Args: + url (str): The URL of the page being processed. + html_content (str): The raw HTML content to parse and extract. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary. 
+ """ + parsed_html = self._parse_html(html_content) base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector']) @@ -772,6 +793,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy): return field.get('default') def _extract_single_field(self, element, field): + """ + Extract a single field based on its type. + + How it works: + 1. Selects the target element using the field's selector. + 2. Extracts the field value based on its type (e.g., text, attribute, regex). + 3. Applies transformations if defined in the schema. + + Args: + element: The base element to extract the field from. + field (Dict[str, Any]): The field definition in the schema. + + Returns: + Any: The extracted field value. + """ + if 'selector' in field: selected = self._get_elements(element, field['selector']) if not selected: @@ -806,6 +843,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy): return item def _extract_item(self, element, fields): + """ + Extracts fields from a given element. + + How it works: + 1. Iterates through the fields defined in the schema. + 2. Handles computed, single, and nested field types. + 3. Updates the item dictionary with extracted field values. + + Args: + element: The base element to extract fields from. + fields (List[Dict[str, Any]]): The list of fields to extract. + + Returns: + Dict[str, Any]: A dictionary representing the extracted item. + """ + item = {} for field in fields: if field['type'] == 'computed': @@ -817,6 +870,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy): return item def _apply_transform(self, value, transform): + """ + Apply a transformation to a value. + + How it works: + 1. Checks the transformation type (e.g., `lowercase`, `strip`). + 2. Applies the transformation to the value. + 3. Returns the transformed value. + + Args: + value (str): The value to transform. + transform (str): The type of transformation to apply. + + Returns: + str: The transformed value. 
+ """ + if transform == 'lowercase': return value.lower() elif transform == 'uppercase': @@ -837,6 +906,23 @@ class JsonElementExtractionStrategy(ExtractionStrategy): return field.get('default') def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Run the extraction strategy on a combined HTML content. + + How it works: + 1. Combines multiple HTML sections using the `DEL` delimiter. + 2. Calls the `extract` method with the combined HTML. + + Args: + url (str): The URL of the page being processed. + sections (List[str]): A list of HTML sections. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items. + """ + combined_html = self.DEL.join(sections) return self.extract(url, combined_html, **kwargs) @@ -856,6 +942,27 @@ class JsonElementExtractionStrategy(ExtractionStrategy): pass class JsonCssExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors. + + How it works: + 1. Parses HTML content with BeautifulSoup. + 2. Selects elements using CSS selectors defined in the schema. + 3. Extracts field data and applies transformations as defined. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into a BeautifulSoup object. + _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector. + _get_elements(element, selector): Selects child elements using a CSS selector. + _get_element_text(element): Extracts text content from a BeautifulSoup element. + _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element. + _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element. 
+ """ + def __init__(self, schema: Dict[str, Any], **kwargs): kwargs['input_format'] = 'html' # Force HTML input super().__init__(schema, **kwargs) @@ -880,6 +987,28 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): return element.get(attribute) class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors. + + How it works: + 1. Parses HTML content into an lxml tree. + 2. Selects elements using XPath expressions. + 3. Converts CSS selectors to XPath when needed. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into an lxml tree. + _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector. + _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression. + _get_elements(element, selector): Selects child elements using an XPath selector. + _get_element_text(element): Extracts text content from an lxml element. + _get_element_html(element): Extracts the raw HTML content of an lxml element. + _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element. 
+ """ + def __init__(self, schema: Dict[str, Any], **kwargs): kwargs['input_format'] = 'html' # Force HTML input super().__init__(schema, **kwargs) @@ -921,259 +1050,3 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): def _get_element_attribute(self, element, attribute: str): return element.get(attribute) - -class _JsonCssExtractionStrategy(ExtractionStrategy): - def __init__(self, schema: Dict[str, Any], **kwargs): - kwargs['input_format'] = 'html' # Force HTML input - super().__init__(**kwargs) - self.schema = schema - - def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: - soup = BeautifulSoup(html, 'html.parser') - base_elements = soup.select(self.schema['baseSelector']) - - results = [] - for element in base_elements: - # Extract base element attributes first - item = {} - if 'baseFields' in self.schema: - for field in self.schema['baseFields']: - value = self._extract_single_field(element, field) - if value is not None: - item[field['name']] = value - - # Then extract child fields - field_data = self._extract_item(element, self.schema['fields']) - item.update(field_data) - - results.append(item) - - return results - - def _extract_field(self, element, field): - try: - if field['type'] == 'nested': - nested_element = element.select_one(field['selector']) - return self._extract_item(nested_element, field['fields']) if nested_element else {} - - if field['type'] == 'list': - elements = element.select(field['selector']) - return [self._extract_list_item(el, field['fields']) for el in elements] - - if field['type'] == 'nested_list': - elements = element.select(field['selector']) - return [self._extract_item(el, field['fields']) for el in elements] - - return self._extract_single_field(element, field) - except Exception as e: - if self.verbose: - print(f"Error extracting field {field['name']}: {str(e)}") - return field.get('default') - - def _extract_list_item(self, element, fields): - item = {} - for field in fields: - 
value = self._extract_single_field(element, field) - if value is not None: - item[field['name']] = value - return item - - def _extract_single_field(self, element, field): - if 'selector' in field: - selected = element.select_one(field['selector']) - if not selected: - return field.get('default') - else: - selected = element - - value = None - if field['type'] == 'text': - value = selected.get_text(strip=True) - elif field['type'] == 'attribute': - value = selected.get(field['attribute']) - elif field['type'] == 'html': - value = str(selected) - elif field['type'] == 'regex': - text = selected.get_text(strip=True) - match = re.search(field['pattern'], text) - value = match.group(1) if match else None - - if 'transform' in field: - value = self._apply_transform(value, field['transform']) - - return value if value is not None else field.get('default') - - def _extract_item(self, element, fields): - item = {} - for field in fields: - if field['type'] == 'computed': - value = self._compute_field(item, field) - else: - value = self._extract_field(element, field) - if value is not None: - item[field['name']] = value - return item - - def _apply_transform(self, value, transform): - if transform == 'lowercase': - return value.lower() - elif transform == 'uppercase': - return value.upper() - elif transform == 'strip': - return value.strip() - return value - - def _compute_field(self, item, field): - try: - if 'expression' in field: - return eval(field['expression'], {}, item) - elif 'function' in field: - return field['function'](item) - except Exception as e: - if self.verbose: - print(f"Error computing field {field['name']}: {str(e)}") - return field.get('default') - - def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: - combined_html = self.DEL.join(sections) - return self.extract(url, combined_html, **kwargs) -class _JsonXPathExtractionStrategy(ExtractionStrategy): - def __init__(self, schema: Dict[str, Any], **kwargs): - 
kwargs['input_format'] = 'html' # Force HTML input - super().__init__(**kwargs) - self.schema = schema - - def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]: - tree = html.fromstring(html_content) - base_xpath = self.schema['baseSelector'] - base_elements = tree.xpath(base_xpath) - - results = [] - for element in base_elements: - # Extract base element attributes first - item = {} - if 'baseFields' in self.schema: - for field in self.schema['baseFields']: - value = self._extract_single_field(element, field) - if value is not None: - item[field['name']] = value - - # Then extract child fields - field_data = self._extract_item(element, self.schema['fields']) - item.update(field_data) - - results.append(item) - - return results - - def _css_to_xpath(self, css_selector: str) -> str: - """Convert CSS selector to XPath if needed""" - if '/' in css_selector: # Already an XPath - return css_selector - else: - # Fallback to basic conversion for common cases - return self._basic_css_to_xpath(css_selector) - - def _basic_css_to_xpath(self, css_selector: str) -> str: - """Basic CSS to XPath conversion for common cases""" - # Handle basic cases - if ' > ' in css_selector: - parts = css_selector.split(' > ') - return '//' + '/'.join(parts) - if ' ' in css_selector: - parts = css_selector.split(' ') - return '//' + '//'.join(parts) - return '//' + css_selector - - def _extract_field(self, element, field): - try: - if field['type'] == 'nested': - xpath = self._css_to_xpath(field['selector']) - nested_element = element.xpath(xpath)[0] if element.xpath(xpath) else None - return self._extract_item(nested_element, field['fields']) if nested_element is not None else {} - - if field['type'] == 'list': - xpath = self._css_to_xpath(field['selector']) - elements = element.xpath(xpath) - return [self._extract_list_item(el, field['fields']) for el in elements] - - if field['type'] == 'nested_list': - xpath = self._css_to_xpath(field['selector']) - elements 
= element.xpath(xpath) - return [self._extract_item(el, field['fields']) for el in elements] - - return self._extract_single_field(element, field) - except Exception as e: - if self.verbose: - print(f"Error extracting field {field['name']}: {str(e)}") - return field.get('default') - - def _extract_list_item(self, element, fields): - item = {} - for field in fields: - value = self._extract_single_field(element, field) - if value is not None: - item[field['name']] = value - return item - - def _extract_single_field(self, element, field): - if 'selector' in field: - xpath = self._css_to_xpath(field['selector']) - selected = element.xpath(xpath) - if not selected: - return field.get('default') - selected = selected[0] - else: - selected = element - - value = None - if field['type'] == 'text': - value = ''.join(selected.xpath('.//text()')).strip() - elif field['type'] == 'attribute': - value = selected.get(field['attribute']) - elif field['type'] == 'html': - value = etree.tostring(selected, encoding='unicode') - elif field['type'] == 'regex': - text = ''.join(selected.xpath('.//text()')).strip() - match = re.search(field['pattern'], text) - value = match.group(1) if match else None - - if 'transform' in field: - value = self._apply_transform(value, field['transform']) - - return value if value is not None else field.get('default') - - def _extract_item(self, element, fields): - item = {} - for field in fields: - if field['type'] == 'computed': - value = self._compute_field(item, field) - else: - value = self._extract_field(element, field) - if value is not None: - item[field['name']] = value - return item - - def _apply_transform(self, value, transform): - if transform == 'lowercase': - return value.lower() - elif transform == 'uppercase': - return value.upper() - elif transform == 'strip': - return value.strip() - return value - - def _compute_field(self, item, field): - try: - if 'expression' in field: - return eval(field['expression'], {}, item) - elif 'function' in 
field: - return field['function'](item) - except Exception as e: - if self.verbose: - print(f"Error computing field {field['name']}: {str(e)}") - return field.get('default') - - def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: - combined_html = self.DEL.join(sections) - return self.extract(url, combined_html, **kwargs) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index b9e4b0c6..474dc9e8 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -38,11 +38,44 @@ class MarkdownGenerationStrategy(ABC): pass class DefaultMarkdownGenerator(MarkdownGenerationStrategy): - """Default implementation of markdown generation strategy.""" + """ + Default implementation of markdown generation strategy. + + How it works: + 1. Generate raw markdown from cleaned HTML. + 2. Convert links to citations. + 3. Generate fit markdown if content filter is provided. + 4. Return MarkdownGenerationResult. + + Args: + content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. + options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None. + + Returns: + MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. + """ def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): super().__init__(content_filter, options) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: + """ + Convert links in markdown to citations. + + How it works: + 1. Find all links in the markdown. + 2. Convert links to citations. + 3. Return converted markdown and references markdown. + + Note: + This function uses a regex pattern to find links in markdown. + + Args: + markdown (str): Markdown text. + base_url (str): Base URL for URL joins. 
+ + Returns: + Tuple[str, str]: Converted markdown and references markdown. + """ link_map = {} url_cache = {} # Cache for URL joins parts = [] @@ -90,7 +123,26 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult: - """Generate markdown with citations from cleaned HTML.""" + """ + Generate markdown with citations from cleaned HTML. + + How it works: + 1. Generate raw markdown from cleaned HTML. + 2. Convert links to citations. + 3. Generate fit markdown if content filter is provided. + 4. Return MarkdownGenerationResult. + + Args: + cleaned_html (str): Cleaned HTML content. + base_url (str): Base URL for URL joins. + html2text_options (Optional[Dict[str, Any]]): HTML2Text options. + options (Optional[Dict[str, Any]]): Additional options for markdown generation. + content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. + citations (bool): Whether to generate citations. + + Returns: + MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. + """ # Initialize HTML2Text with options h = CustomHTML2Text() if html2text_options: diff --git a/crawl4ai/ssl_certificate.py b/crawl4ai/ssl_certificate.py index f6e76823..97529e3e 100644 --- a/crawl4ai/ssl_certificate.py +++ b/crawl4ai/ssl_certificate.py @@ -13,13 +13,34 @@ from pathlib import Path class SSLCertificate: """ A class representing an SSL certificate with methods to export in various formats. + + Attributes: + cert_info (Dict[str, Any]): The certificate information. + + Methods: + from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL. + from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file. + from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data. 
+ export_as_pem() -> str: Export the certificate as PEM format. + export_as_der() -> bytes: Export the certificate as DER format. + export_as_json() -> Dict[str, Any]: Export the certificate as JSON format. + export_as_text() -> str: Export the certificate as text format. """ def __init__(self, cert_info: Dict[str, Any]): self._cert_info = self._decode_cert_data(cert_info) @staticmethod def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: - """Create SSLCertificate instance from a URL.""" + """ + Create SSLCertificate instance from a URL. + + Args: + url (str): URL of the website. + timeout (int): Timeout for the connection (default: 10). + + Returns: + Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise. + """ try: hostname = urlparse(url).netloc if ':' in hostname: @@ -73,7 +94,15 @@ class SSLCertificate: return data def to_json(self, filepath: Optional[str] = None) -> Optional[str]: - """Export certificate as JSON.""" + """ + Export certificate as JSON. + + Args: + filepath (Optional[str]): Path to save the JSON file (default: None). + + Returns: + Optional[str]: JSON string if successful, None otherwise. + """ json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False) if filepath: Path(filepath).write_text(json_str, encoding='utf-8') @@ -81,7 +110,15 @@ class SSLCertificate: return json_str def to_pem(self, filepath: Optional[str] = None) -> Optional[str]: - """Export certificate as PEM.""" + """ + Export certificate as PEM. + + Args: + filepath (Optional[str]): Path to save the PEM file (default: None). + + Returns: + Optional[str]: PEM string if successful, None otherwise. + """ try: x509 = OpenSSL.crypto.load_certificate( OpenSSL.crypto.FILETYPE_ASN1, @@ -100,7 +137,15 @@ class SSLCertificate: return None def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]: - """Export certificate as DER.""" + """ + Export certificate as DER. 
+ + Args: + filepath (Optional[str]): Path to save the DER file (default: None). + + Returns: + Optional[bytes]: DER bytes if successful, None otherwise. + """ try: der_data = base64.b64decode(self._cert_info['raw_cert']) if filepath: diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py index a1f3a49e..6679bb1b 100644 --- a/crawl4ai/user_agent_generator.py +++ b/crawl4ai/user_agent_generator.py @@ -4,6 +4,34 @@ import re class UserAgentGenerator: + """ + Generate random user agents with specified constraints. + + Attributes: + desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings. + mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings. + browser_combinations (dict): A dictionary of possible browser combinations and their corresponding user agent strings. + rendering_engines (dict): A dictionary of possible rendering engines and their corresponding user agent strings. + chrome_versions (list): A list of possible Chrome browser versions. + firefox_versions (list): A list of possible Firefox browser versions. + edge_versions (list): A list of possible Edge browser versions. + safari_versions (list): A list of possible Safari browser versions. + ios_versions (list): A list of possible iOS browser versions. + android_versions (list): A list of possible Android browser versions. + + Methods: + generate_user_agent( + platform: Literal["desktop", "mobile"] = "desktop", + browser: str = "chrome", + rendering_engine: str = "chrome_webkit", + chrome_version: Optional[str] = None, + firefox_version: Optional[str] = None, + edge_version: Optional[str] = None, + safari_version: Optional[str] = None, + ios_version: Optional[str] = None, + android_version: Optional[str] = None + ): Generates a random user agent string based on the specified parameters. + """ def __init__(self): # Previous platform definitions remain the same... 
self.desktop_platforms = { @@ -105,7 +133,21 @@ class UserAgentGenerator: ] def get_browser_stack(self, num_browsers: int = 1) -> List[str]: - """Get a valid combination of browser versions""" + """ + Get a valid combination of browser versions. + + How it works: + 1. Check if the number of browsers is supported. + 2. Randomly choose a combination of browsers. + 3. Iterate through the combination and add browser versions. + 4. Return the browser stack. + + Args: + num_browsers: Number of browser specifications (1-3) + + Returns: + List[str]: A list of browser versions. + """ if num_browsers not in self.browser_combinations: raise ValueError(f"Unsupported number of browsers: {num_browsers}") diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index de08e02b..214ebbc6 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -25,64 +25,91 @@ from functools import wraps class InvalidCSSSelectorError(Exception): pass -def create_box_message( - message: str, - type: str = "info", - width: int = 120, - add_newlines: bool = True, - double_line: bool = False -) -> str: - init() - - # Define border and text colors for different types - styles = { - "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"), - "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), - "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"), - "error": (Fore.RED, Fore.LIGHTRED_EX, "×"), - } - - border_color, text_color, prefix = styles.get(type.lower(), styles["info"]) - - # Define box characters based on line style - box_chars = { - "single": ("─", "│", "┌", "┐", "└", "┘"), - "double": ("═", "║", "╔", "╗", "╚", "╝") - } - line_style = "double" if double_line else "single" - h_line, v_line, tl, tr, bl, br = box_chars[line_style] - - # Process lines with lighter text color - formatted_lines = [] - raw_lines = message.split('\n') - - if raw_lines: - first_line = f"{prefix} {raw_lines[0].strip()}" - wrapped_first = textwrap.fill(first_line, width=width-4) - formatted_lines.extend(wrapped_first.split('\n')) - - for line in 
raw_lines[1:]: - if line.strip(): - wrapped = textwrap.fill(f" {line.strip()}", width=width-4) - formatted_lines.extend(wrapped.split('\n')) - else: - formatted_lines.append("") - - # Create the box with colored borders and lighter text - horizontal_line = h_line * (width - 1) - box = [ - f"{border_color}{tl}{horizontal_line}{tr}", - *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines], - f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}" - ] - - result = "\n".join(box) - if add_newlines: - result = f"\n{result}\n" - - return result +def create_box_message(message: str, type: str = "info", width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str: + """ + Create a styled message box with colored borders and formatted text. + + How it works: + 1. Determines box style and colors based on the message type (e.g., info, warning). + 2. Wraps text to fit within the specified width. + 3. Constructs a box using characters (single or double lines) with appropriate formatting. + 4. Adds optional newlines before and after the box. + + Args: + message (str): The message to display inside the box. + type (str): Type of the message (e.g., "info", "warning", "error", "success"). Defaults to "info". + width (int): Width of the box. Defaults to 120. + add_newlines (bool): Whether to add newlines before and after the box. Defaults to True. + double_line (bool): Whether to use double lines for the box border. Defaults to False. + + Returns: + str: A formatted string containing the styled message box. 
+ """ + + init() + + # Define border and text colors for different types + styles = { + "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"), + "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), + "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"), + "error": (Fore.RED, Fore.LIGHTRED_EX, "×"), + } + + border_color, text_color, prefix = styles.get(type.lower(), styles["info"]) + + # Define box characters based on line style + box_chars = { + "single": ("─", "│", "┌", "┐", "└", "┘"), + "double": ("═", "║", "╔", "╗", "╚", "╝") + } + line_style = "double" if double_line else "single" + h_line, v_line, tl, tr, bl, br = box_chars[line_style] + + # Process lines with lighter text color + formatted_lines = [] + raw_lines = message.split('\n') + + if raw_lines: + first_line = f"{prefix} {raw_lines[0].strip()}" + wrapped_first = textwrap.fill(first_line, width=width-4) + formatted_lines.extend(wrapped_first.split('\n')) + + for line in raw_lines[1:]: + if line.strip(): + wrapped = textwrap.fill(f" {line.strip()}", width=width-4) + formatted_lines.extend(wrapped.split('\n')) + else: + formatted_lines.append("") + + # Create the box with colored borders and lighter text + horizontal_line = h_line * (width - 1) + box = [ + f"{border_color}{tl}{horizontal_line}{tr}", + *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines], + f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}" + ] + + result = "\n".join(box) + if add_newlines: + result = f"\n{result}\n" + + return result def calculate_semaphore_count(): + """ + Calculate the optimal semaphore count based on system resources. + + How it works: + 1. Determines the number of CPU cores and total system memory. + 2. Sets a base count as half of the available CPU cores. + 3. Limits the count based on memory, assuming 2GB per semaphore instance. + 4. Returns the minimum value between CPU and memory-based limits. + + Returns: + int: The calculated semaphore count. 
+ """ + cpu_count = os.cpu_count() memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB base_count = max(1, cpu_count // 2) @@ -90,6 +117,21 @@ def calculate_semaphore_count(): return min(base_count, memory_based_cap) def get_system_memory(): + """ + Get the total system memory in bytes. + + How it works: + 1. Detects the operating system. + 2. Reads memory information from system-specific commands or files. + 3. Converts the memory to bytes for uniformity. + + Returns: + int: The total system memory in bytes. + + Raises: + OSError: If the operating system is unsupported. + """ + system = platform.system() if system == "Linux": with open('/proc/meminfo', 'r') as mem: @@ -124,6 +166,18 @@ def get_system_memory(): raise OSError("Unsupported operating system") def get_home_folder(): + """ + Get or create the home folder for Crawl4AI configuration and cache. + + How it works: + 1. Uses environment variables or defaults to the user's home directory. + 2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist. + 3. Returns the path to the home folder. + + Returns: + str: The path to the Crawl4AI home folder. + """ + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) @@ -194,6 +248,20 @@ def split_and_parse_json_objects(json_string): return parsed_objects, unparsed_segments def sanitize_html(html): + """ + Sanitize an HTML string by escaping quotes. + + How it works: + 1. Replaces all unwanted and special characters with an empty string. + 2. Escapes double and single quotes for safe usage. + + Args: + html (str): The HTML string to sanitize. + + Returns: + str: The sanitized HTML string. 
+ """ + # Replace all unwanted and special characters with an empty string sanitized_html = html # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html) @@ -248,6 +316,23 @@ def escape_json_string(s): return s def replace_inline_tags(soup, tags, only_text=False): + """ + Replace inline HTML tags with Markdown-style equivalents. + + How it works: + 1. Maps specific tags (e.g., , ) to Markdown syntax. + 2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object. + 3. Optionally replaces tags with their text content only. + + Args: + soup (BeautifulSoup): Parsed HTML content. + tags (List[str]): List of tags to replace. + only_text (bool): Whether to replace tags with plain text. Defaults to False. + + Returns: + BeautifulSoup: Updated BeautifulSoup object with replaced tags. + """ + tag_replacements = { 'b': lambda tag: f"**{tag.text}**", 'i': lambda tag: f"*{tag.text}*", @@ -292,6 +377,26 @@ def replace_inline_tags(soup, tags, only_text=False): # return soup def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs): + """ + Extract structured content, media, and links from website HTML. + + How it works: + 1. Parses the HTML content using BeautifulSoup. + 2. Extracts internal/external links and media (images, videos, audios). + 3. Cleans the content by removing unwanted tags and attributes. + 4. Converts cleaned HTML to Markdown. + 5. Collects metadata and returns the extracted information. + + Args: + url (str): The website URL. + html (str): The HTML content of the website. + word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD. + css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None. + + Returns: + Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata. 
+ """ + try: if not html: return None @@ -762,6 +867,27 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: } def extract_metadata(html, soup=None): + """ + Extract optimized content, media, and links from website HTML. + + How it works: + 1. Similar to `get_content_of_website`, but optimized for performance. + 2. Filters and scores images for usefulness. + 3. Extracts contextual descriptions for media files. + 4. Handles excluded tags and CSS selectors. + 5. Cleans HTML and converts it to Markdown. + + Args: + url (str): The website URL. + html (str): The HTML content of the website. + word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD. + css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None. + **kwargs: Additional options for customization. + + Returns: + Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata. + """ + metadata = {} if not html and not soup: @@ -809,10 +935,35 @@ def extract_metadata(html, soup=None): return metadata def extract_xml_tags(string): + """ + Extracts XML tags from a string. + + Args: + string (str): The input string containing XML tags. + + Returns: + List[str]: A list of XML tags extracted from the input string. + """ tags = re.findall(r'<(\w+)>', string) return list(set(tags)) def extract_xml_data(tags, string): + """ + Extract data for specified XML tags from a string. + + How it works: + 1. Searches the string for each tag using regex. + 2. Extracts the content within the tags. + 3. Returns a dictionary of tag-content pairs. + + Args: + tags (List[str]): The list of XML tags to extract. + string (str): The input string containing XML data. + + Returns: + Dict[str, str]: A dictionary with tag names as keys and extracted content as values. 
+ """ + data = {} for tag in tags: @@ -833,6 +984,26 @@ def perform_completion_with_backoff( base_url=None, **kwargs ): + """ + Perform an API completion request with exponential backoff. + + How it works: + 1. Sends a completion request to the API. + 2. Retries on rate-limit errors with exponential delays. + 3. Returns the API response or an error after all retries. + + Args: + provider (str): The name of the API provider. + prompt_with_variables (str): The input prompt for the completion request. + api_token (str): The API token for authentication. + json_response (bool): Whether to request a JSON response. Defaults to False. + base_url (Optional[str]): The base URL for the API. Defaults to None. + **kwargs: Additional arguments for the API request. + + Returns: + dict: The API response or an error message after all retries. + """ + from litellm import completion from litellm.exceptions import RateLimitError max_attempts = 3 @@ -878,6 +1049,25 @@ def perform_completion_with_backoff( }] def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None): + """ + Extract content blocks from website HTML using an AI provider. + + How it works: + 1. Prepares a prompt by sanitizing and escaping HTML. + 2. Sends the prompt to an AI provider with optional retries. + 3. Parses the response to extract structured blocks or errors. + + Args: + url (str): The website URL. + html (str): The HTML content of the website. + provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER. + api_token (Optional[str]): The API token for authentication. Defaults to None. + base_url (Optional[str]): The base URL for the API. Defaults to None. + + Returns: + List[dict]: A list of extracted content blocks. 
+ """ + # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token @@ -914,6 +1104,23 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, bas return blocks def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None): + """ + Extract content blocks from a batch of website HTMLs. + + How it works: + 1. Prepares prompts for each URL and HTML pair. + 2. Sends the prompts to the AI provider in a batch request. + 3. Parses the responses to extract structured blocks or errors. + + Args: + batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs. + provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192". + api_token (Optional[str]): The API token for authentication. Defaults to None. + + Returns: + List[dict]: A list of extracted content blocks from all batch items. + """ + api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token from litellm import batch_completion messages = [] @@ -986,6 +1193,25 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold): return merged_sections def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list: + """ + Process sections of HTML content sequentially or in parallel. + + How it works: + 1. Sequentially processes sections with delays for "groq/" providers. + 2. Uses ThreadPoolExecutor for parallel processing with other providers. + 3. Extracts content blocks for each section. + + Args: + url (str): The website URL. + sections (List[str]): The list of HTML sections to process. + provider (str): The AI provider for content extraction. + api_token (str): The API token for authentication. + base_url (Optional[str]): The base URL for the API. Defaults to None. + + Returns: + List[dict]: The list of extracted content blocks from all sections. 
+ """ + extracted_content = [] if provider.startswith("groq/"): # Sequential processing with a delay @@ -1002,6 +1228,24 @@ def process_sections(url: str, sections: list, provider: str, api_token: str, ba return extracted_content def wrap_text(draw, text, font, max_width): + """ + Wrap text to fit within a specified width for rendering. + + How it works: + 1. Splits the text into words. + 2. Constructs lines that fit within the maximum width using the provided font. + 3. Returns the wrapped text as a single string. + + Args: + draw (ImageDraw.Draw): The drawing context for measuring text size. + text (str): The text to wrap. + font (ImageFont.FreeTypeFont): The font to use for measuring text size. + max_width (int): The maximum width for each line. + + Returns: + str: The wrapped text. + """ + # Wrap the text to fit within the specified width lines = [] words = text.split() @@ -1013,6 +1257,21 @@ def wrap_text(draw, text, font, max_width): return '\n'.join(lines) def format_html(html_string): + """ + Prettify an HTML string using BeautifulSoup. + + How it works: + 1. Parses the HTML string with BeautifulSoup. + 2. Formats the HTML with proper indentation. + 3. Returns the prettified HTML string. + + Args: + html_string (str): The HTML string to format. + + Returns: + str: The prettified HTML string. + """ + soup = BeautifulSoup(html_string, 'lxml.parser') return soup.prettify() @@ -1110,7 +1369,20 @@ def normalize_url_tmp(href, base_url): return href.strip() def get_base_domain(url: str) -> str: - """Extract base domain from URL, handling various edge cases.""" + """ + Extract the base domain from a given URL, handling common edge cases. + + How it works: + 1. Parses the URL to extract the domain. + 2. Removes the port number and 'www' prefix. + 3. Handles special domains (e.g., 'co.uk') to extract the correct base. + + Args: + url (str): The URL to extract the base domain from. + + Returns: + str: The extracted base domain or an empty string if parsing fails. 
+ """ try: # Get domain from URL domain = urlparse(url).netloc.lower() @@ -1136,7 +1408,20 @@ def get_base_domain(url: str) -> str: return "" def is_external_url(url: str, base_domain: str) -> bool: - """Check if URL is external to base domain.""" + """ + Extract the base domain from a given URL, handling common edge cases. + + How it works: + 1. Parses the URL to extract the domain. + 2. Removes the port number and 'www' prefix. + 3. Handles special domains (e.g., 'co.uk') to extract the correct base. + + Args: + url (str): The URL to extract the base domain from. + + Returns: + str: The extracted base domain or an empty string if parsing fails. + """ special = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'} if any(url.lower().startswith(p) for p in special): return True @@ -1155,8 +1440,22 @@ def is_external_url(url: str, base_domain: str) -> bool: except Exception: return False - def clean_tokens(tokens: list[str]) -> list[str]: + """ + Clean a list of tokens by removing noise, stop words, and short tokens. + + How it works: + 1. Defines a set of noise words and stop words. + 2. Filters tokens based on length and exclusion criteria. + 3. Excludes tokens starting with certain symbols (e.g., "↑", "▲"). + + Args: + tokens (list[str]): The list of tokens to clean. + + Returns: + list[str]: The cleaned list of tokens. + """ + # Set of tokens to remove noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'} @@ -1212,6 +1511,21 @@ def clean_tokens(tokens: list[str]) -> list[str]: and not token.startswith('⬆')] def profile_and_time(func): + """ + Decorator to profile a function's execution time and performance. + + How it works: + 1. Records the start time before executing the function. + 2. Profiles the function's execution using `cProfile`. + 3. Prints the elapsed time and profiling statistics. + + Args: + func (Callable): The function to decorate. 
+ + Returns: + Callable: The decorated function with profiling and timing enabled. + """ + @wraps(func) def wrapper(self, *args, **kwargs): # Start timer diff --git a/docs/examples/amazon_product_extraction_direct_url.py b/docs/examples/amazon_product_extraction_direct_url.py new file mode 100644 index 00000000..769c479e --- /dev/null +++ b/docs/examples/amazon_product_extraction_direct_url.py @@ -0,0 +1,114 @@ +""" +This example demonstrates how to use JSON CSS extraction to scrape product information +from Amazon search results. It shows how to extract structured data like product titles, +prices, ratings, and other details using CSS selectors. +""" + +from crawl4ai import AsyncWebCrawler +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import json + +async def extract_amazon_products(): + # Initialize browser config + browser_config = BrowserConfig( + browser_type="chromium", + headless=True + ) + + # Initialize crawler config with JSON CSS extraction strategy + crawler_config = CrawlerRunConfig( + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "Amazon Product Search Results", + "baseSelector": "[data-component-type='s-search-result']", + "fields": [ + { + "name": "asin", + "selector": "", + "type": "attribute", + "attribute": "data-asin" + }, + { + "name": "title", + "selector": "h2 a span", + "type": "text" + }, + { + "name": "url", + "selector": "h2 a", + "type": "attribute", + "attribute": "href" + }, + { + "name": "image", + "selector": ".s-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "rating", + "selector": ".a-icon-star-small .a-icon-alt", + "type": "text" + }, + { + "name": "reviews_count", + "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span", + "type": "text" + }, + { + "name": "price", + "selector": ".a-price .a-offscreen", + "type": "text" + }, + { + "name": "original_price", + "selector": 
".a-price.a-text-price .a-offscreen", + "type": "text" + }, + { + "name": "sponsored", + "selector": ".puis-sponsored-label-text", + "type": "exists" + }, + { + "name": "delivery_info", + "selector": "[data-cy='delivery-recipe'] .a-color-base", + "type": "text", + "multiple": True + } + ] + } + ) + ) + + # Example search URL (you should replace with your actual Amazon URL) + url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab" + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + # Extract the data + result = await crawler.arun(url=url, config=crawler_config) + + # Process and print the results + if result and result.extracted_content: + # Parse the JSON string into a list of products + products = json.loads(result.extracted_content) + + # Process each product in the list + for product in products: + print("\nProduct Details:") + print(f"ASIN: {product.get('asin')}") + print(f"Title: {product.get('title')}") + print(f"Price: {product.get('price')}") + print(f"Original Price: {product.get('original_price')}") + print(f"Rating: {product.get('rating')}") + print(f"Reviews: {product.get('reviews_count')}") + print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}") + if product.get('delivery_info'): + print(f"Delivery: {' '.join(product['delivery_info'])}") + print("-" * 80) + +if __name__ == "__main__": + import asyncio + asyncio.run(extract_amazon_products()) diff --git a/docs/examples/amazon_product_extraction_using_hooks.py b/docs/examples/amazon_product_extraction_using_hooks.py new file mode 100644 index 00000000..a17d60c5 --- /dev/null +++ b/docs/examples/amazon_product_extraction_using_hooks.py @@ -0,0 +1,145 @@ +""" +This example demonstrates how to use JSON CSS extraction to scrape product information +from Amazon search results. It shows how to extract structured data like product titles, +prices, ratings, and other details using CSS selectors. 
+""" + +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import json +from playwright.async_api import Page, BrowserContext + +async def extract_amazon_products(): + # Initialize browser config + browser_config = BrowserConfig( + # browser_type="chromium", + headless=True + ) + + # Initialize crawler config with JSON CSS extraction strategy nav-search-submit-button + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "Amazon Product Search Results", + "baseSelector": "[data-component-type='s-search-result']", + "fields": [ + { + "name": "asin", + "selector": "", + "type": "attribute", + "attribute": "data-asin" + }, + { + "name": "title", + "selector": "h2 a span", + "type": "text" + }, + { + "name": "url", + "selector": "h2 a", + "type": "attribute", + "attribute": "href" + }, + { + "name": "image", + "selector": ".s-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "rating", + "selector": ".a-icon-star-small .a-icon-alt", + "type": "text" + }, + { + "name": "reviews_count", + "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span", + "type": "text" + }, + { + "name": "price", + "selector": ".a-price .a-offscreen", + "type": "text" + }, + { + "name": "original_price", + "selector": ".a-price.a-text-price .a-offscreen", + "type": "text" + }, + { + "name": "sponsored", + "selector": ".puis-sponsored-label-text", + "type": "exists" + }, + { + "name": "delivery_info", + "selector": "[data-cy='delivery-recipe'] .a-color-base", + "type": "text", + "multiple": True + } + ] + } + ) + ) + + url = "https://www.amazon.com/" + + async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs): + """Hook called after navigating to each URL""" + print(f"[HOOK] after_goto - Successfully loaded: 
{url}") + + try: + # Wait for search box to be available + search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000) + + # Type the search query + await search_box.fill('Samsung Galaxy Tab') + + # Get the search button and prepare for navigation + search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000) + + # Click with navigation waiting + await search_button.click() + + # Wait for search results to load + await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000) + print("[HOOK] Search completed and results loaded!") + + except Exception as e: + print(f"[HOOK] Error during search operation: {str(e)}") + + return page + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + + crawler.crawler_strategy.set_hook("after_goto", after_goto) + + # Extract the data + result = await crawler.arun(url=url, config=crawler_config) + + # Process and print the results + if result and result.extracted_content: + # Parse the JSON string into a list of products + products = json.loads(result.extracted_content) + + # Process each product in the list + for product in products: + print("\nProduct Details:") + print(f"ASIN: {product.get('asin')}") + print(f"Title: {product.get('title')}") + print(f"Price: {product.get('price')}") + print(f"Original Price: {product.get('original_price')}") + print(f"Rating: {product.get('rating')}") + print(f"Reviews: {product.get('reviews_count')}") + print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}") + if product.get('delivery_info'): + print(f"Delivery: {' '.join(product['delivery_info'])}") + print("-" * 80) + +if __name__ == "__main__": + import asyncio + asyncio.run(extract_amazon_products()) diff --git a/docs/examples/amazon_product_extraction_using_use_javascript.py b/docs/examples/amazon_product_extraction_using_use_javascript.py new file mode 100644 index 00000000..15e5d6f5 --- 
/dev/null +++ b/docs/examples/amazon_product_extraction_using_use_javascript.py @@ -0,0 +1,129 @@ +""" +This example demonstrates how to use JSON CSS extraction to scrape product information +from Amazon search results. It shows how to extract structured data like product titles, +prices, ratings, and other details using CSS selectors. +""" + +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import json +from playwright.async_api import Page, BrowserContext + +async def extract_amazon_products(): + # Initialize browser config + browser_config = BrowserConfig( + # browser_type="chromium", + headless=True + ) + + js_code_to_search = """ + const task = async () => { + document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab'; + document.querySelector('#nav-search-submit-button').click(); + } + await task(); + """ + js_code_to_search_sync = """ + document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab'; + document.querySelector('#nav-search-submit-button').click(); + """ + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + js_code = js_code_to_search, + wait_for='css:[data-component-type="s-search-result"]', + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "Amazon Product Search Results", + "baseSelector": "[data-component-type='s-search-result']", + "fields": [ + { + "name": "asin", + "selector": "", + "type": "attribute", + "attribute": "data-asin" + }, + { + "name": "title", + "selector": "h2 a span", + "type": "text" + }, + { + "name": "url", + "selector": "h2 a", + "type": "attribute", + "attribute": "href" + }, + { + "name": "image", + "selector": ".s-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "rating", + "selector": ".a-icon-star-small .a-icon-alt", + "type": "text" + }, + { + "name": "reviews_count", + "selector": 
"[data-csa-c-func-deps='aui-da-a-popover'] ~ span span", + "type": "text" + }, + { + "name": "price", + "selector": ".a-price .a-offscreen", + "type": "text" + }, + { + "name": "original_price", + "selector": ".a-price.a-text-price .a-offscreen", + "type": "text" + }, + { + "name": "sponsored", + "selector": ".puis-sponsored-label-text", + "type": "exists" + }, + { + "name": "delivery_info", + "selector": "[data-cy='delivery-recipe'] .a-color-base", + "type": "text", + "multiple": True + } + ] + } + ) + ) + + # Example search URL (you should replace with your actual Amazon URL) + url = "https://www.amazon.com/" + + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + # Extract the data + result = await crawler.arun(url=url, config=crawler_config) + + # Process and print the results + if result and result.extracted_content: + # Parse the JSON string into a list of products + products = json.loads(result.extracted_content) + + # Process each product in the list + for product in products: + print("\nProduct Details:") + print(f"ASIN: {product.get('asin')}") + print(f"Title: {product.get('title')}") + print(f"Price: {product.get('price')}") + print(f"Original Price: {product.get('original_price')}") + print(f"Rating: {product.get('rating')}") + print(f"Reviews: {product.get('reviews_count')}") + print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}") + if product.get('delivery_info'): + print(f"Delivery: {' '.join(product['delivery_info'])}") + print("-" * 80) + +if __name__ == "__main__": + import asyncio + asyncio.run(extract_amazon_products()) diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py index ff312688..4c4a9d86 100644 --- a/docs/examples/quickstart_async.config.py +++ b/docs/examples/quickstart_async.config.py @@ -1,6 +1,8 @@ import os, sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) 
-os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692" + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) import asyncio import time @@ -12,7 +14,10 @@ from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai.extraction_strategy import ( + JsonCssExtractionStrategy, + LLMExtractionStrategy, +) __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) @@ -21,128 +26,182 @@ print("GitHub Repository: https://github.com/unclecode/crawl4ai") print("Twitter: @unclecode") print("Website: https://crawl4ai.com") + # Basic Example - Simple Crawl async def simple_crawl(): print("\n--- Basic Usage ---") browser_config = BrowserConfig(headless=True) - crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS - ) - + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=crawler_config + url="https://www.nbcnews.com/business", config=crawler_config ) print(result.markdown[:500]) + +async def clean_content(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + excluded_tags=["nav", "footer", "aside"], + remove_overlay_elements=True, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ), + options={"ignore_links": True}, + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + config=crawler_config, + ) + 
full_markdown_length = len(result.markdown_v2.raw_markdown) + fit_markdown_length = len(result.markdown_v2.fit_markdown) + print(f"Full Markdown Length: {full_markdown_length}") + print(f"Fit Markdown Length: {fit_markdown_length}") + +async def link_analysis(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + exclude_external_links=True, + exclude_social_media_links=True, + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config, + ) + print(f"Found {len(result.links['internal'])} internal links") + print(f"Found {len(result.links['external'])} external links") + + for link in result.links['internal'][:5]: + print(f"Href: {link['href']}\nText: {link['text']}\n") + # JavaScript Execution Example async def simple_example_with_running_js_code(): print("\n--- Executing JavaScript and Using CSS Selectors ---") - - browser_config = BrowserConfig( - headless=True, - java_script_enabled=True - ) - + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, - js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"], + js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();", # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }" ) - + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=crawler_config + url="https://www.nbcnews.com/business", config=crawler_config ) print(result.markdown[:500]) + # CSS Selector Example async def simple_example_with_css_selector(): print("\n--- Using CSS Selectors ---") 
browser_config = BrowserConfig(headless=True) crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - css_selector=".wide-tease-item__description" + cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description" ) - + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + +async def media_handling(): + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True) + async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", config=crawler_config ) - print(result.markdown[:500]) + for img in result.media['images'][:5]: + print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}") + +async def custom_hook_workflow(verbose=True): + async with AsyncWebCrawler() as crawler: + # Set a 'before_goto' hook to run custom code just before navigation + crawler.crawler_strategy.set_hook("before_goto", lambda page, context: print("[Hook] Preparing to navigate...")) + + # Perform the crawl operation + result = await crawler.arun( + url="https://crawl4ai.com" + ) + print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- ")) + # Proxy Example async def use_proxy(): print("\n--- Using a Proxy ---") browser_config = BrowserConfig( headless=True, - proxy="http://your-proxy-url:port" + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "username", + "password": "password", + }, ) - crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS - ) - + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=crawler_config + url="https://www.nbcnews.com/business", config=crawler_config ) if result.success: print(result.markdown[:500]) + 
# Screenshot Example async def capture_and_save_screenshot(url: str, output_path: str): browser_config = BrowserConfig(headless=True) - crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - screenshot=True - ) - + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True) + async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun( - url=url, - config=crawler_config - ) - + result = await crawler.arun(url=url, config=crawler_config) + if result.success and result.screenshot: import base64 + screenshot_data = base64.b64decode(result.screenshot) - with open(output_path, 'wb') as f: + with open(output_path, "wb") as f: f.write(screenshot_data) print(f"Screenshot saved successfully to {output_path}") else: print("Failed to capture screenshot") + # LLM Extraction Example class OpenAIModelFee(BaseModel): model_name: str = Field(..., description="Name of the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") - output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + output_fee: str = Field( + ..., description="Fee for output token for the OpenAI model." + ) -async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None): + +async def extract_structured_data_using_llm( + provider: str, api_token: str = None, extra_headers: Dict[str, str] = None +): print(f"\n--- Extracting Structured Data with {provider} ---") - + if api_token is None and provider != "ollama": print(f"API token is required for {provider}. 
Skipping this example.") return browser_config = BrowserConfig(headless=True) - - extra_args = { - "temperature": 0, - "top_p": 0.9, - "max_tokens": 2000 - } + + extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000} if extra_headers: extra_args["extra_headers"] = extra_headers crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, word_count_threshold=1, - page_timeout = 80000, + page_timeout=80000, extraction_strategy=LLMExtractionStrategy( provider=provider, api_token=api_token, @@ -150,17 +209,17 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. Do not miss any models in the entire content.""", - extra_args=extra_args - ) + extra_args=extra_args, + ), ) - + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://openai.com/api/pricing/", - config=crawler_config + url="https://openai.com/api/pricing/", config=crawler_config ) print(result.extracted_content) + # CSS Extraction Example async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") @@ -192,16 +251,13 @@ async def extract_structured_data_using_css_extractor(): "name": "course_icon", "selector": ".image-92", "type": "attribute", - "attribute": "src" - } - ] + "attribute": "src", + }, + ], } - browser_config = BrowserConfig( - headless=True, - java_script_enabled=True - ) - + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + js_click_tabs = """ (async () => { const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); @@ -212,23 +268,23 @@ async def extract_structured_data_using_css_extractor(): } })(); """ - + crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=JsonCssExtractionStrategy(schema), 
- js_code=[js_click_tabs] + js_code=[js_click_tabs], ) - + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://www.kidocode.com/degrees/technology", - config=crawler_config + url="https://www.kidocode.com/degrees/technology", config=crawler_config ) companies = json.loads(result.extracted_content) print(f"Successfully extracted {len(companies)} companies") print(json.dumps(companies[0], indent=2)) + # Dynamic Content Examples - Method 1 async def crawl_dynamic_content_pages_method_1(): print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") @@ -249,10 +305,7 @@ async def crawl_dynamic_content_pages_method_1(): except Exception as e: print(f"Warning: New content didn't appear after JavaScript execution: {e}") - browser_config = BrowserConfig( - headless=False, - java_script_enabled=True - ) + browser_config = BrowserConfig(headless=False, java_script_enabled=True) async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) @@ -272,7 +325,7 @@ async def crawl_dynamic_content_pages_method_1(): css_selector="li.Box-sc-g0xbh4-0", js_code=js_next_page if page > 0 else None, js_only=page > 0, - session_id=session_id + session_id=session_id, ) result = await crawler.arun(url=url, config=crawler_config) @@ -286,14 +339,12 @@ async def crawl_dynamic_content_pages_method_1(): print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + # Dynamic Content Examples - Method 2 async def crawl_dynamic_content_pages_method_2(): print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") - browser_config = BrowserConfig( - headless=False, - java_script_enabled=True - ) + browser_config = BrowserConfig(headless=False, java_script_enabled=True) js_next_page_and_wait = """ (async () => { @@ -343,7 +394,7 @@ async def crawl_dynamic_content_pages_method_2(): extraction_strategy=extraction_strategy, 
js_code=js_next_page_and_wait if page > 0 else None, js_only=page > 0, - session_id=session_id + session_id=session_id, ) result = await crawler.arun(url=url, config=crawler_config) @@ -355,88 +406,128 @@ async def crawl_dynamic_content_pages_method_2(): print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + +async def cosine_similarity_extraction(): + crawl_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=CosineStrategy( + word_count_threshold=10, + max_dist=0.2, # Maximum distance between two words + linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single) + top_k=3, # Number of top keywords to extract + sim_threshold=0.3, # Similarity threshold for clustering + semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings + verbose=True + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156", + config=crawl_config + ) + print(json.loads(result.extracted_content)[:5]) + # Browser Comparison async def crawl_custom_browser_type(): print("\n--- Browser Comparison ---") - + # Firefox - browser_config_firefox = BrowserConfig( - browser_type="firefox", - headless=True - ) + browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True) start = time.time() async with AsyncWebCrawler(config=browser_config_firefox) as crawler: result = await crawler.arun( url="https://www.example.com", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), ) print("Firefox:", time.time() - start) print(result.markdown[:500]) # WebKit - browser_config_webkit = BrowserConfig( - browser_type="webkit", - headless=True - ) + browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True) 
start = time.time() async with AsyncWebCrawler(config=browser_config_webkit) as crawler: result = await crawler.arun( url="https://www.example.com", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), ) print("WebKit:", time.time() - start) print(result.markdown[:500]) # Chromium (default) - browser_config_chromium = BrowserConfig( - browser_type="chromium", - headless=True - ) + browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True) start = time.time() async with AsyncWebCrawler(config=browser_config_chromium) as crawler: result = await crawler.arun( url="https://www.example.com", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), ) print("Chromium:", time.time() - start) print(result.markdown[:500]) + # Anti-Bot and User Simulation async def crawl_with_user_simulation(): browser_config = BrowserConfig( headless=True, user_agent_mode="random", - user_agent_generator_config={ - "device_type": "mobile", - "os_type": "android" - } + user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, ) crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, magic=True, simulate_user=True, - override_navigator=True + override_navigator=True, ) async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun( - url="YOUR-URL-HERE", - config=crawler_config - ) + result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config) print(result.markdown) +async def ssl_certification(): + # Configure crawler to fetch SSL certificate + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url='https://example.com', + config=config + ) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + # 1. 
Access certificate properties directly + print("\nCertificate Information:") + print(f"Issuer: {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # 2. Export certificate in different formats + cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis + print("\nCertificate exported to:") + print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}") + + pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers + print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}") + + der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps + print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") + # Speed Comparison async def speed_comparison(): print("\n--- Speed Comparison ---") - + # Firecrawl comparison from firecrawl import FirecrawlApp - app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY']) + + app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"]) start = time.time() scrape_status = app.scrape_url( - 'https://www.nbcnews.com/business', - params={'formats': ['markdown', 'html']} + "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]} ) end = time.time() print("Firecrawl:") @@ -447,16 +538,15 @@ async def speed_comparison(): # Crawl4AI comparisons browser_config = BrowserConfig(headless=True) - + # Simple crawl async with AsyncWebCrawler(config=browser_config) as crawler: start = time.time() result = await crawler.arun( url="https://www.nbcnews.com/business", config=CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - word_count_threshold=0 - ) + cache_mode=CacheMode.BYPASS, word_count_threshold=0 + ), ) end = time.time() print("Crawl4AI (simple crawl):") @@ -474,12 +564,10 @@ async def speed_comparison(): word_count_threshold=0, markdown_generator=DefaultMarkdownGenerator( content_filter=PruningContentFilter( - threshold=0.48, - threshold_type="fixed", - min_word_threshold=0 + threshold=0.48, 
threshold_type="fixed", min_word_threshold=0 ) - ) - ) + ), + ), ) end = time.time() print("Crawl4AI (Markdown Plus):") @@ -489,22 +577,25 @@ async def speed_comparison(): print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}") print() + # Main execution async def main(): # Basic examples # await simple_crawl() # await simple_example_with_running_js_code() # await simple_example_with_css_selector() - + # Advanced examples # await extract_structured_data_using_css_extractor() - await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm( + "openai/gpt-4o", os.getenv("OPENAI_API_KEY") + ) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() - + # Browser comparisons # await crawl_custom_browser_type() - + # Performance testing # await speed_comparison() @@ -514,5 +605,6 @@ async def main(): # os.path.join(__location__, "tmp/example_screenshot.jpg") # ) + if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index bd4c425f..e640e6bd 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -627,13 +627,13 @@ async def main(): # } # await extract_structured_data_using_llm(extra_headers=custom_headers) - await crawl_dynamic_content_pages_method_1() - await crawl_dynamic_content_pages_method_2() + # await crawl_dynamic_content_pages_method_1() + # await crawl_dynamic_content_pages_method_2() await crawl_dynamic_content_pages_method_3() - await crawl_custom_browser_type() + # await crawl_custom_browser_type() - await speed_comparison() + # await speed_comparison() if __name__ == "__main__": diff --git a/docs/examples/tmp/chainlit_review.py b/docs/examples/tmp/chainlit_review.py deleted file mode 100644 index 2c03d17d..00000000 --- a/docs/examples/tmp/chainlit_review.py +++ /dev/null @@ -1,281 
+0,0 @@ -from openai import AsyncOpenAI -from chainlit.types import ThreadDict -import chainlit as cl -from chainlit.input_widget import Select, Switch, Slider -client = AsyncOpenAI() - -# Instrument the OpenAI client -cl.instrument_openai() - -settings = { - "model": "gpt-3.5-turbo", - "temperature": 0.5, - "max_tokens": 500, - "top_p": 1, - "frequency_penalty": 0, - "presence_penalty": 0, -} - -@cl.action_callback("action_button") -async def on_action(action: cl.Action): - print("The user clicked on the action button!") - - return "Thank you for clicking on the action button!" - -@cl.set_chat_profiles -async def chat_profile(): - return [ - cl.ChatProfile( - name="GPT-3.5", - markdown_description="The underlying LLM model is **GPT-3.5**.", - icon="https://picsum.photos/200", - ), - cl.ChatProfile( - name="GPT-4", - markdown_description="The underlying LLM model is **GPT-4**.", - icon="https://picsum.photos/250", - ), - ] - -@cl.on_chat_start -async def on_chat_start(): - - settings = await cl.ChatSettings( - [ - Select( - id="Model", - label="OpenAI - Model", - values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"], - initial_index=0, - ), - Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True), - Slider( - id="Temperature", - label="OpenAI - Temperature", - initial=1, - min=0, - max=2, - step=0.1, - ), - Slider( - id="SAI_Steps", - label="Stability AI - Steps", - initial=30, - min=10, - max=150, - step=1, - description="Amount of inference steps performed on image generation.", - ), - Slider( - id="SAI_Cfg_Scale", - label="Stability AI - Cfg_Scale", - initial=7, - min=1, - max=35, - step=0.1, - description="Influences how strongly your generation is guided to match your prompt.", - ), - Slider( - id="SAI_Width", - label="Stability AI - Image Width", - initial=512, - min=256, - max=2048, - step=64, - tooltip="Measured in pixels", - ), - Slider( - id="SAI_Height", - label="Stability AI - Image Height", - initial=512, - min=256, - 
max=2048, - step=64, - tooltip="Measured in pixels", - ), - ] - ).send() - - chat_profile = cl.user_session.get("chat_profile") - await cl.Message( - content=f"starting chat using the {chat_profile} chat profile" - ).send() - - print("A new chat session has started!") - cl.user_session.set("session", { - "history": [], - "context": [] - }) - - image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline") - - # Attach the image to the message - await cl.Message( - content="You are such a good girl, aren't you?!", - elements=[image], - ).send() - - text_content = "Hello, this is a text element." - elements = [ - cl.Text(name="simple_text", content=text_content, display="inline") - ] - - await cl.Message( - content="Check out this text element!", - elements=elements, - ).send() - - elements = [ - cl.Audio(path="./assets/audio.mp3", display="inline"), - ] - await cl.Message( - content="Here is an audio file", - elements=elements, - ).send() - - await cl.Avatar( - name="Tool 1", - url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4", - ).send() - - await cl.Message( - content="This message should not have an avatar!", author="Tool 0" - ).send() - - await cl.Message( - content="This message should have an avatar!", author="Tool 1" - ).send() - - elements = [ - cl.File( - name="quickstart.py", - path="./quickstart.py", - display="inline", - ), - ] - - await cl.Message( - content="This message has a file element", elements=elements - ).send() - - # Sending an action button within a chatbot message - actions = [ - cl.Action(name="action_button", value="example_value", description="Click me!") - ] - - await cl.Message(content="Interact with this action button:", actions=actions).send() - - # res = await cl.AskActionMessage( - # content="Pick an action!", - # actions=[ - # cl.Action(name="continue", value="continue", label="✅ Continue"), - # cl.Action(name="cancel", 
value="cancel", label="❌ Cancel"), - # ], - # ).send() - - # if res and res.get("value") == "continue": - # await cl.Message( - # content="Continue!", - # ).send() - - # import plotly.graph_objects as go - # fig = go.Figure( - # data=[go.Bar(y=[2, 1, 3])], - # layout_title_text="An example figure", - # ) - # elements = [cl.Plotly(name="chart", figure=fig, display="inline")] - - # await cl.Message(content="This message has a chart", elements=elements).send() - - # Sending a pdf with the local file path - # elements = [ - # cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf") - # ] - - # cl.Message(content="Look at this local pdf!", elements=elements).send() - -@cl.on_settings_update -async def setup_agent(settings): - print("on_settings_update", settings) - -@cl.on_stop -def on_stop(): - print("The user wants to stop the task!") - -@cl.on_chat_end -def on_chat_end(): - print("The user disconnected!") - - -@cl.on_chat_resume -async def on_chat_resume(thread: ThreadDict): - print("The user resumed a previous chat session!") - - - - -# @cl.on_message -async def on_message(message: cl.Message): - cl.user_session.get("session")["history"].append({ - "role": "user", - "content": message.content - }) - response = await client.chat.completions.create( - messages=[ - { - "content": "You are a helpful bot", - "role": "system" - }, - *cl.user_session.get("session")["history"] - ], - **settings - ) - - - # Add assitanr message to the history - cl.user_session.get("session")["history"].append({ - "role": "assistant", - "content": response.choices[0].message.content - }) - - # msg.content = response.choices[0].message.content - # await msg.update() - - # await cl.Message(content=response.choices[0].message.content).send() - -@cl.on_message -async def on_message(message: cl.Message): - cl.user_session.get("session")["history"].append({ - "role": "user", - "content": message.content - }) - - msg = cl.Message(content="") - await msg.send() - - stream = await 
client.chat.completions.create( - messages=[ - { - "content": "You are a helpful bot", - "role": "system" - }, - *cl.user_session.get("session")["history"] - ], - stream = True, - **settings - ) - - async for part in stream: - if token := part.choices[0].delta.content or "": - await msg.stream_token(token) - - # Add assitanr message to the history - cl.user_session.get("session")["history"].append({ - "role": "assistant", - "content": msg.content - }) - await msg.update() - -if __name__ == "__main__": - from chainlit.cli import run_chainlit - run_chainlit(__file__) \ No newline at end of file diff --git a/docs/examples/tmp/research_assistant_audio_not_completed.py b/docs/examples/tmp/research_assistant_audio_not_completed.py deleted file mode 100644 index e0ad2b4f..00000000 --- a/docs/examples/tmp/research_assistant_audio_not_completed.py +++ /dev/null @@ -1,238 +0,0 @@ -# Make sure to install the required packageschainlit and groq -import os, time -from openai import AsyncOpenAI -import chainlit as cl -import re -import requests -from io import BytesIO -from chainlit.element import ElementBased -from groq import Groq - -# Import threadpools to run the crawl_url function in a separate thread -from concurrent.futures import ThreadPoolExecutor - -client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY")) - -# Instrument the OpenAI client -cl.instrument_openai() - -settings = { - "model": "llama3-8b-8192", - "temperature": 0.5, - "max_tokens": 500, - "top_p": 1, - "frequency_penalty": 0, - "presence_penalty": 0, -} - -def extract_urls(text): - url_pattern = re.compile(r'(https?://\S+)') - return url_pattern.findall(text) - -def crawl_url(url): - data = { - "urls": [url], - "include_raw_html": True, - "word_count_threshold": 10, - "extraction_strategy": "NoExtractionStrategy", - "chunking_strategy": "RegexChunking" - } - response = requests.post("https://crawl4ai.com/crawl", json=data) - response_data = response.json() - 
response_data = response_data['results'][0] - return response_data['markdown'] - -@cl.on_chat_start -async def on_chat_start(): - cl.user_session.set("session", { - "history": [], - "context": {} - }) - await cl.Message( - content="Welcome to the chat! How can I assist you today?" - ).send() - -@cl.on_message -async def on_message(message: cl.Message): - user_session = cl.user_session.get("session") - - # Extract URLs from the user's message - urls = extract_urls(message.content) - - - futures = [] - with ThreadPoolExecutor() as executor: - for url in urls: - futures.append(executor.submit(crawl_url, url)) - - results = [future.result() for future in futures] - - for url, result in zip(urls, results): - ref_number = f"REF_{len(user_session['context']) + 1}" - user_session["context"][ref_number] = { - "url": url, - "content": result - } - - # for url in urls: - # # Crawl the content of each URL and add it to the session context with a reference number - # ref_number = f"REF_{len(user_session['context']) + 1}" - # crawled_content = crawl_url(url) - # user_session["context"][ref_number] = { - # "url": url, - # "content": crawled_content - # } - - user_session["history"].append({ - "role": "user", - "content": message.content - }) - - # Create a system message that includes the context - context_messages = [ - f'\n{data["content"]}\n' - for ref, data in user_session["context"].items() - ] - if context_messages: - system_message = { - "role": "system", - "content": ( - "You are a helpful bot. Use the following context for answering questions. " - "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n" - "If the question requires any information from the provided appendices or context, refer to the sources. " - "If not, there is no need to add a references section. 
" - "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n" - "\n\n".join(context_messages) - ) - } - else: - system_message = { - "role": "system", - "content": "You are a helpful assistant." - } - - - msg = cl.Message(content="") - await msg.send() - - # Get response from the LLM - stream = await client.chat.completions.create( - messages=[ - system_message, - *user_session["history"] - ], - stream=True, - **settings - ) - - assistant_response = "" - async for part in stream: - if token := part.choices[0].delta.content: - assistant_response += token - await msg.stream_token(token) - - # Add assistant message to the history - user_session["history"].append({ - "role": "assistant", - "content": assistant_response - }) - await msg.update() - - # Append the reference section to the assistant's response - reference_section = "\n\nReferences:\n" - for ref, data in user_session["context"].items(): - reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n" - - msg.content += reference_section - await msg.update() - - -@cl.on_audio_chunk -async def on_audio_chunk(chunk: cl.AudioChunk): - if chunk.isStart: - buffer = BytesIO() - # This is required for whisper to recognize the file type - buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}" - # Initialize the session for a new audio stream - cl.user_session.set("audio_buffer", buffer) - cl.user_session.set("audio_mime_type", chunk.mimeType) - - # Write the chunks to a buffer and transcribe the whole audio at the end - cl.user_session.get("audio_buffer").write(chunk.data) - - pass - -@cl.step(type="tool") -async def speech_to_text(audio_file): - cli = Groq() - - # response = cli.audio.transcriptions.create( - # file=audio_file, #(filename, file.read()), - # model="whisper-large-v3", - # ) - - response = await client.audio.transcriptions.create( - model="whisper-large-v3", file=audio_file - ) - - return response.text - - 
-@cl.on_audio_end -async def on_audio_end(elements: list[ElementBased]): - # Get the audio buffer from the session - audio_buffer: BytesIO = cl.user_session.get("audio_buffer") - audio_buffer.seek(0) # Move the file pointer to the beginning - audio_file = audio_buffer.read() - audio_mime_type: str = cl.user_session.get("audio_mime_type") - - # input_audio_el = cl.Audio( - # mime=audio_mime_type, content=audio_file, name=audio_buffer.name - # ) - # await cl.Message( - # author="You", - # type="user_message", - # content="", - # elements=[input_audio_el, *elements] - # ).send() - - # answer_message = await cl.Message(content="").send() - - - start_time = time.time() - whisper_input = (audio_buffer.name, audio_file, audio_mime_type) - transcription = await speech_to_text(whisper_input) - end_time = time.time() - print(f"Transcription took {end_time - start_time} seconds") - - user_msg = cl.Message( - author="You", - type="user_message", - content=transcription - ) - await user_msg.send() - await on_message(user_msg) - - # images = [file for file in elements if "image" in file.mime] - - # text_answer = await generate_text_answer(transcription, images) - - # output_name, output_audio = await text_to_speech(text_answer, audio_mime_type) - - # output_audio_el = cl.Audio( - # name=output_name, - # auto_play=True, - # mime=audio_mime_type, - # content=output_audio, - # ) - - # answer_message.elements = [output_audio_el] - - # answer_message.content = transcription - # await answer_message.update() - -if __name__ == "__main__": - from chainlit.cli import run_chainlit - run_chainlit(__file__) - - diff --git a/docs/llm.txt/1_introduction.md b/docs/llm.txt/1_introduction.md index c83c2495..b2deb414 100644 --- a/docs/llm.txt/1_introduction.md +++ b/docs/llm.txt/1_introduction.md @@ -2,20 +2,39 @@ Crawl4AI, the **#1 trending GitHub repository**, streamlines web content extraction into AI-ready formats. 
Perfect for AI assistants, semantic search engines, or data pipelines, Crawl4AI transforms raw HTML into structured Markdown or JSON effortlessly. Integrate with LLMs, open-source models, or your own retrieval-augmented generation workflows. -**Key Links:** -- **Website:** [https://crawl4ai.com](https://crawl4ai.com) -- **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai) -- **Colab Notebook:** [Try on Google Colab](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) -- **Quickstart Code Example:** [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) -- **Examples Folder:** [Crawl4AI Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) +**What Crawl4AI is not:** + +Crawl4AI is not a replacement for traditional web scraping libraries, Selenium, or Playwright. It's not designed as a general-purpose web automation tool. Instead, Crawl4AI has a specific, focused goal: + +- To generate perfect, AI-friendly data (particularly for LLMs) from web content +- To maximize speed and efficiency in data extraction and processing +- To operate at scale, from Raspberry Pi to cloud infrastructures + +Crawl4AI is engineered with a "scale-first" mindset, aiming to handle millions of links while maintaining exceptional performance. It's super efficient and fast, optimized to: + +1. Transform raw web content into structured, LLM-ready formats (Markdown/JSON) +2. Implement intelligent extraction strategies to reduce reliance on costly API calls +3. Provide a streamlined pipeline for AI data preparation and ingestion + +In essence, Crawl4AI bridges the gap between web content and AI systems, focusing on delivering high-quality, processed data rather than offering broad web automation capabilities. 
+ +**Key Links:** + +- **Website:** [https://crawl4ai.com](https://crawl4ai.com) +- **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai) +- **Colab Notebook:** [Try on Google Colab](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) +- **Quickstart Code Example:** [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) +- **Examples Folder:** [Crawl4AI Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) --- ## Table of Contents + - [Crawl4AI Quick Start Guide: Your All-in-One AI-Ready Web Crawling \& AI Integration Solution](#crawl4ai-quick-start-guide-your-all-in-one-ai-ready-web-crawling--ai-integration-solution) - [Table of Contents](#table-of-contents) - [1. Introduction \& Key Concepts](#1-introduction--key-concepts) - [2. Installation \& Environment Setup](#2-installation--environment-setup) + - [Test Your Installation](#test-your-installation) - [3. Core Concepts \& Configuration](#3-core-concepts--configuration) - [4. Basic Crawling \& Simple Extraction](#4-basic-crawling--simple-extraction) - [5. Markdown Generation \& AI-Optimized Output](#5-markdown-generation--ai-optimized-output) @@ -38,15 +57,17 @@ Crawl4AI, the **#1 trending GitHub repository**, streamlines web content extract --- ## 1. Introduction & Key Concepts + Crawl4AI transforms websites into structured, AI-friendly data. It efficiently handles large-scale crawling, integrates with both proprietary and open-source LLMs, and optimizes content for semantic search or RAG pipelines. **Quick Test:** + ```python import asyncio from crawl4ai import AsyncWebCrawler async def test_run(): - async with AsyncWebCrawler(verbose=True) as crawler: + async with AsyncWebCrawler() as crawler: result = await crawler.arun("https://example.com") print(result.markdown) @@ -60,12 +81,41 @@ If you see Markdown output, everything is working! --- ## 2. 
Installation & Environment Setup + ```bash +# Install the package pip install crawl4ai crawl4ai-setup -playwright install chromium + +# Install Playwright with system dependencies (recommended) +playwright install --with-deps # Installs all browsers + +# Or install specific browsers: +playwright install --with-deps chrome # Recommended for Colab/Linux +playwright install --with-deps firefox +playwright install --with-deps webkit +playwright install --with-deps chromium + +# Keep Playwright updated periodically +playwright install ``` +> **Note**: For Google Colab and some Linux environments, use `chrome` instead of `chromium` - it tends to work more reliably. + +### Test Your Installation +Try these one-liners: + +```python +# Visible browser test +python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=False); page = browser.new_page(); page.goto('https://example.com'); input('Press Enter to close...')" + +# Headless test (for servers/CI) +python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=True); page = browser.new_page(); page.goto('https://example.com'); print(f'Title: {page.title()}'); browser.close()" +``` + +You should see a browser window (in visible test) loading example.com. If you get errors, try with Firefox using `playwright install --with-deps firefox`. + + **Try in Colab:** [Open Colab Notebook](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) @@ -74,16 +124,19 @@ playwright install chromium --- ## 3. Core Concepts & Configuration + Use `AsyncWebCrawler`, `CrawlerRunConfig`, and `BrowserConfig` to control crawling. 
**Example config:** + ```python from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig browser_config = BrowserConfig( headless=True, - viewport_width=1920, - viewport_height=1080, + verbose=True, + viewport_width=1080, + viewport_height=600, text_mode=False, ignore_https_errors=True, java_script_enabled=True @@ -97,7 +150,7 @@ run_config = CrawlerRunConfig( wait_for="css:.article-loaded", page_timeout=60000, delay_before_return_html=1.0, - mean_delay=0.1, + mean_delay=0.1, max_range=0.3, process_iframes=True, remove_overlay_elements=True, @@ -115,15 +168,17 @@ run_config = CrawlerRunConfig( ``` **Prefixes:** -- `http://` or `https://` for live pages -- `file://local.html` for local -- `raw:` for raw HTML strings + +- `http://` or `https://` for live pages +- `file://local.html` for local +- `raw:` for raw HTML strings **More info:** [See /docs/async_webcrawler](#) or [3_async_webcrawler.ex.md](https://github.com/unclecode/crawl4ai/blob/main/async_webcrawler.ex.md) --- ## 4. Basic Crawling & Simple Extraction + ```python async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun("https://news.example.com/article", config=run_config) @@ -137,13 +192,15 @@ async with AsyncWebCrawler(config=browser_config) as crawler: ## 5. 
Markdown Generation & AI-Optimized Output After crawling, `result.markdown_v2` provides: -- `raw_markdown`: Unfiltered markdown -- `markdown_with_citations`: Links as references at the bottom -- `references_markdown`: A separate list of reference links -- `fit_markdown`: Filtered, relevant markdown (e.g., after BM25) -- `fit_html`: The HTML used to produce `fit_markdown` + +- `raw_markdown`: Unfiltered markdown +- `markdown_with_citations`: Links as references at the bottom +- `references_markdown`: A separate list of reference links +- `fit_markdown`: Filtered, relevant markdown (e.g., after BM25) +- `fit_html`: The HTML used to produce `fit_markdown` **Example:** + ```python print("RAW:", result.markdown_v2.raw_markdown[:200]) print("CITED:", result.markdown_v2.markdown_with_citations[:200]) @@ -158,9 +215,11 @@ For AI training, `fit_markdown` focuses on the most relevant content. --- ## 6. Structured Data Extraction (CSS, XPath, LLM) + Extract JSON data without LLMs: **CSS:** + ```python from crawl4ai.extraction_strategy import JsonCssExtractionStrategy @@ -176,6 +235,7 @@ run_config.extraction_strategy = JsonCssExtractionStrategy(schema) ``` **XPath:** + ```python from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy @@ -195,6 +255,7 @@ run_config.extraction_strategy = JsonXPathExtractionStrategy(xpath_schema) --- ## 7. Advanced Extraction: LLM & Open-Source Models + Use LLMExtractionStrategy for complex tasks. Works with OpenAI or open-source models (e.g., Ollama). ```python @@ -217,7 +278,9 @@ run_config.extraction_strategy = LLMExtractionStrategy( --- ## 8. Page Interactions, JS Execution, & Dynamic Content + Insert `js_code` and use `wait_for` to ensure content loads. Example: + ```python run_config.js_code = """ (async () => { @@ -233,6 +296,7 @@ run_config.wait_for = "css:.item-loaded" --- ## 9. Media, Links, & Metadata Handling + `result.media["images"]`: List of images with `src`, `score`, `alt`. Score indicates relevance. 
`result.media["videos"]`, `result.media["audios"]` similarly hold media info. @@ -242,6 +306,7 @@ run_config.wait_for = "css:.item-loaded" `result.metadata`: Title, description, keywords, author. **Example:** + ```python # Images for img in result.media["images"]: @@ -263,30 +328,37 @@ print("Description:", result.metadata["description"]) ## 10. Authentication & Identity Preservation ### Manual Setup via User Data Directory + 1. **Open Chrome with a custom user data dir:** - ```bash - "C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir="C:\MyChromeProfile" - ``` - On macOS: - ```bash - "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --user-data-dir="/Users/username/ChromeProfiles/MyProfile" - ``` + + ```bash + "C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir="C:\MyChromeProfile" + ``` + + On macOS: + + ```bash + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --user-data-dir="/Users/username/ChromeProfiles/MyProfile" + ``` 2. **Log in to sites, solve CAPTCHAs, adjust settings manually.** The browser saves cookies/localStorage in that directory. 3. **Use `user_data_dir` in `BrowserConfig`:** - ```python - browser_config = BrowserConfig( - headless=True, - user_data_dir="/Users/username/ChromeProfiles/MyProfile" - ) - ``` - Now the crawler starts with those cookies, sessions, etc. + ```python + browser_config = BrowserConfig( + headless=True, + user_data_dir="/Users/username/ChromeProfiles/MyProfile" + ) + ``` + + Now the crawler starts with those cookies, sessions, etc. ### Using `storage_state` + Alternatively, export and reuse storage states: + ```python browser_config = BrowserConfig( headless=True, @@ -301,7 +373,9 @@ No repeated logins needed. --- ## 11. 
Proxy & Security Enhancements + Use `proxy_config` for authenticated proxies: + ```python browser_config.proxy_config = { "server": "http://proxy.example.com:8080", @@ -317,6 +391,7 @@ Combine with `headers` or `ignore_https_errors` as needed. --- ## 12. Screenshots, PDFs & File Downloads + Enable `screenshot=True` or `pdf=True` in `CrawlerRunConfig`: ```python @@ -325,6 +400,7 @@ run_config.pdf = True ``` After crawling: + ```python if result.screenshot: with open("page.png", "wb") as f: @@ -336,6 +412,7 @@ if result.pdf: ``` **File Downloads:** + ```python browser_config.accept_downloads = True browser_config.downloads_path = "./downloads" @@ -351,7 +428,9 @@ Also [10_file_download.md](https://github.com/unclecode/crawl4ai/blob/main/file_ --- ## 13. Caching & Performance Optimization + Set `cache_mode` to reuse fetch results: + ```python from crawl4ai import CacheMode run_config.cache_mode = CacheMode.ENABLED @@ -364,11 +443,13 @@ Adjust delays, increase concurrency, or use `text_mode=True` for faster extracti --- ## 14. Hooks for Custom Logic + Hooks let you run code at specific lifecycle events without creating pages manually in `on_browser_created`. Use `on_page_context_created` to apply routing or modify page contexts before crawling the URL: **Example Hook:** + ```python async def on_page_context_created_hook(context, page, **kwargs): # Block all images to speed up load @@ -388,21 +469,25 @@ This hook is clean and doesn’t create a separate page itself—it just modifie --- ## 15. 
Dockerization & Scaling + Use Docker images: -- AMD64 basic: +- AMD64 basic: + ```bash docker pull unclecode/crawl4ai:basic-amd64 docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64 ``` -- ARM64 for M1/M2: +- ARM64 for M1/M2: + ```bash docker pull unclecode/crawl4ai:basic-arm64 docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64 ``` -- GPU support: +- GPU support: + ```bash docker pull unclecode/crawl4ai:gpu-amd64 docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu-amd64 @@ -415,25 +500,28 @@ Scale with load balancers or Kubernetes. --- ## 16. Troubleshooting & Common Pitfalls -- Empty results? Relax filters, check selectors. -- Timeouts? Increase `page_timeout` or refine `wait_for`. -- CAPTCHAs? Use `user_data_dir` or `storage_state` after manual solving. -- JS errors? Try headful mode for debugging. + +- Empty results? Relax filters, check selectors. +- Timeouts? Increase `page_timeout` or refine `wait_for`. +- CAPTCHAs? Use `user_data_dir` or `storage_state` after manual solving. +- JS errors? Try headful mode for debugging. Check [examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) & [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for more code. --- ## 17. Comprehensive End-to-End Example + Combine hooks, JS execution, PDF saving, LLM extraction—see [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for a full example. --- ## 18. Further Resources & Community -- **Docs:** [https://crawl4ai.com](https://crawl4ai.com) -- **Issues & PRs:** [https://github.com/unclecode/crawl4ai/issues](https://github.com/unclecode/crawl4ai/issues) + +- **Docs:** [https://crawl4ai.com](https://crawl4ai.com) +- **Issues & PRs:** [https://github.com/unclecode/crawl4ai/issues](https://github.com/unclecode/crawl4ai/issues) Follow [@unclecode](https://x.com/unclecode) for news & community updates. 
**Happy Crawling!** -Leverage Crawl4AI to feed your AI models with clean, structured web data today. \ No newline at end of file +Leverage Crawl4AI to feed your AI models with clean, structured web data today. diff --git a/docs/llm.txt/2_configuration.md b/docs/llm.txt/2_configuration.md index df324644..2ec5cb8f 100644 --- a/docs/llm.txt/2_configuration.md +++ b/docs/llm.txt/2_configuration.md @@ -65,7 +65,7 @@ #### `viewport_width` and `viewport_height` - **Description**: Sets the default browser viewport dimensions. - - Default: `1920` (width), `1080` (height) + - Default: `1080` (width), `600` (height) - **Use Case**: - Adjust for crawling responsive layouts or specific device emulations. @@ -134,6 +134,19 @@ - **Use Case**: - Use for advanced browser configurations like WebRTC or GPU tuning. +#### `verbose` +- **Description**: Enable verbose logging of browser operations. + - Default: `True` +- **Use Case**: + - Enable for detailed logging during development and debugging. + - Disable in production for better performance. + +#### `sleep_on_close` +- **Description**: Adds a delay before closing the browser. + - Default: `False` +- **Use Case**: + - Enable when you need to ensure all browser operations are complete before closing. + ## CrawlerRunConfig The `CrawlerRunConfig` class centralizes parameters for controlling crawl operations. This configuration covers content extraction, page interactions, caching, and runtime behaviors. Below is an exhaustive breakdown of parameters and their best-use scenarios. @@ -341,3 +354,37 @@ The `CrawlerRunConfig` class centralizes parameters for controlling crawl operat - **Use Case**: - Enable when debugging JavaScript errors on pages. +##### `parser_type` +- **Description**: Type of parser to use for HTML parsing. + - Default: `"lxml"` +- **Use Case**: + - Use when specific HTML parsing requirements are needed. + - `"lxml"` provides good performance and standards compliance. 
+ +##### `prettiify` +- **Description**: Apply `fast_format_html` to produce prettified HTML output. + - Default: `False` +- **Use Case**: + - Enable for better readability of extracted HTML content. + - Useful during development and debugging. + +##### `fetch_ssl_certificate` +- **Description**: Fetch and store SSL certificate information during crawling. + - Default: `False` +- **Use Case**: + - Enable when SSL certificate analysis is required. + - Useful for security audits and certificate validation. + +##### `url` +- **Description**: Target URL for the crawl operation. + - Default: `None` +- **Use Case**: + - Set when initializing a crawler for a specific URL. + - Can be overridden during actual crawl operations. + +##### `log_console` +- **Description**: Log browser console messages during crawling. + - Default: `False` +- **Use Case**: + - Enable to capture JavaScript console output. + - Useful for debugging JavaScript-heavy pages. diff --git a/docs/llm.txt/3_async_webcrawler.md b/docs/llm.txt/3_async_webcrawler.md index 7d122786..fb4250f8 100644 --- a/docs/llm.txt/3_async_webcrawler.md +++ b/docs/llm.txt/3_async_webcrawler.md @@ -3,6 +3,7 @@ This document provides a comprehensive, human-oriented overview of the `AsyncWebCrawler` class and related components from the `crawl4ai` package. It explains the motivations behind asynchronous crawling, shows how to configure and run crawls, and provides examples for advanced features like dynamic content handling, extraction strategies, caching, containerization, and troubleshooting. ## Introduction +[EDIT: This is not a good way to introduce the library. The library excels at generating crawl data in the form of markdown or extracted JSON as quickly as possible. It is designed to be efficient in terms of memory and CPU usage. Users should choose this library because it generates markdown suitable for large language models and AI. 
Additionally, it can create structured data, which is beneficial because it supports attaching large language models to generate structured data. It also includes techniques like JSON CSS and JSON XPath extraction, allowing users to define patterns and extract data quickly. One of the library's strengths is its ability to work everywhere. It can crawl any website by offering various capabilities, such as connecting to a remote browser or using persistent data. This feature allows developers to create their own identity on websites where they have authentication access, enabling them to crawl without being mistakenly identified as a bot. This is a better way to introduce the library. In these documents, we discuss the main object, the main class, AsyncWebCrawler, and all the functionalities we can achieve with this AsyncWebCrawler.] Crawling websites can be slow if done sequentially, especially when handling large numbers of URLs or rendering dynamic pages. Asynchronous crawling helps you run multiple operations concurrently, improving throughput and performance. The `AsyncWebCrawler` class leverages asynchronous I/O and browser automation tools to fetch content efficiently, handle complex DOM interactions, and extract structured data. diff --git a/docs/llm.txt/5_markdown_generation.md b/docs/llm.txt/5_markdown_generation.md index 235e9f83..69534e54 100644 --- a/docs/llm.txt/5_markdown_generation.md +++ b/docs/llm.txt/5_markdown_generation.md @@ -74,9 +74,10 @@ The Markdown generation process transforms raw HTML into a structured format.
At ```python from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator -from crawl4ai import CrawlerRunConfig, AsyncWebCrawler +from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( options={ "ignore_links": True, diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index 87e468aa..31d33e8c 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -310,22 +310,6 @@ response = requests.post("http://localhost:11235/crawl", json=request) > **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! - - - - - - - - - - - - - - - - ## Usage Examples 📝 ### Basic Crawling