From b6d6631b125bde49b402ba30ae22fc3fb4661228 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 12 Nov 2024 12:10:58 +0800 Subject: [PATCH 01/50] Enhance Async Crawler with Playwright support - Implemented new async crawler strategy using Playwright. - Introduced ManagedBrowser for better browser management. - Added support for persistent browser sessions and improved error handling. - Updated version from 0.3.73 to 0.3.731. - Enhanced logic in main.py for conditional mounting of static files. - Updated requirements to replace playwright_stealth with tf-playwright-stealth. --- crawl4ai/_version.py | 2 +- crawl4ai/async_crawler_strategy.py | 106 ++- crawl4ai/async_crawler_strategy_0.3.73.py | 965 ++++++++++++++++++++++ main.py | 12 +- requirements.txt | 2 +- 5 files changed, 1057 insertions(+), 30 deletions(-) create mode 100644 crawl4ai/async_crawler_strategy_0.3.73.py diff --git a/crawl4ai/_version.py b/crawl4ai/_version.py index 85030f0e..7ab71c9b 100644 --- a/crawl4ai/_version.py +++ b/crawl4ai/_version.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.73" \ No newline at end of file +__version__ = "0.3.731" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index fa50e7b5..896a0644 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -186,6 +186,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.sleep_on_close = kwargs.get("sleep_on_close", False) self.use_managed_browser = kwargs.get("use_managed_browser", False) self.user_data_dir = kwargs.get("user_data_dir", None) + self.use_persistent_context = kwargs.get("use_persistent_context", False) + self.chrome_channel = kwargs.get("chrome_channel", "chrome") self.managed_browser = None self.default_context = None self.hooks = { @@ -197,6 +199,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'before_return_html': None, 'before_retrieve_html': None } + self.extra_args = 
kwargs.get("extra_args", []) async def __aenter__(self): await self.start() @@ -238,36 +241,71 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "User-Agent": self.user_agent }) else: + # Base browser arguments browser_args = { "headless": self.headless, "args": [ - "--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", - "--disable-blink-features=AutomationControlled", + "--no-first-run", + "--no-default-browser-check", "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", "--ignore-certificate-errors-spki-list", - # "--headless=new", # Use the new headless mode ] } + + # Add channel if specified (try Chrome first) + if self.chrome_channel: + browser_args["channel"] = self.chrome_channel + + # Add extra args if provided + if self.extra_args: + browser_args["args"].extend(self.extra_args) # Add proxy settings if a proxy is specified if self.proxy: proxy_settings = ProxySettings(server=self.proxy) browser_args["proxy"] = proxy_settings elif self.proxy_config: - proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password")) + proxy_settings = ProxySettings( + server=self.proxy_config.get("server"), + username=self.proxy_config.get("username"), + password=self.proxy_config.get("password") + ) browser_args["proxy"] = proxy_settings - # Select the appropriate browser based on the browser_type - if self.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) + try: + # Select the appropriate browser based on the browser_type + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + self.browser = await 
self.playwright.webkit.launch(**browser_args) + else: + if self.use_persistent_context and self.user_data_dir: + self.browser = await self.playwright.chromium.launch_persistent_context( + user_data_dir=self.user_data_dir, + **browser_args + ) + self.default_context = self.browser + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + except Exception as e: + # Fallback to chromium if Chrome channel fails + if "chrome" in str(e) and browser_args.get("channel") == "chrome": + browser_args["channel"] = "chromium" + if self.use_persistent_context and self.user_data_dir: + self.browser = await self.playwright.chromium.launch_persistent_context( + user_data_dir=self.user_data_dir, + **browser_args + ) + self.default_context = self.browser + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + else: + raise await self.execute_hook('on_browser_created', self.browser) @@ -461,24 +499,35 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if session_id: context, page, _ = self.sessions.get(session_id, (None, None, None)) if not context: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + # In persistent context, browser is the context + context = self.browser + page = await context.new_page() + else: + # Normal context creation for non-persistent or non-Chrome browsers + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=True, + java_script_enabled=True + ) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + await context.set_extra_http_headers(self.headers) + page = await context.new_page() + self.sessions[session_id] = (context, page, time.time()) + else: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + # In persistent context, browser is the context + context = 
self.browser + else: + # Normal context creation context = await self.browser.new_context( user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True + proxy={"server": self.proxy} if self.proxy else None ) - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await context.set_extra_http_headers(self.headers) - page = await context.new_page() - self.sessions[session_id] = (context, page, time.time()) - else: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None - ) - await context.set_extra_http_headers(self.headers) if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Inject scripts to override navigator properties @@ -512,7 +561,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """) page = await context.new_page() - # await stealth_async(page) #, stealth_config) + if kwargs.get("magic", False): + await stealth_async(page, stealth_config) # Add console message and error logging if kwargs.get("log_console", False): @@ -544,8 +594,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not kwargs.get("js_only", False): await self.execute_hook('before_goto', page) + response = await page.goto( - url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) + url, + # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), + wait_until=kwargs.get("wait_until", "domcontentloaded"), + timeout=kwargs.get("page_timeout", 60000) ) # response = await page.goto("about:blank") diff --git a/crawl4ai/async_crawler_strategy_0.3.73.py b/crawl4ai/async_crawler_strategy_0.3.73.py new file mode 100644 index 00000000..54835dad --- /dev/null +++ b/crawl4ai/async_crawler_strategy_0.3.73.py @@ -0,0 
+1,965 @@ +import asyncio +import base64 +import time +from abc import ABC, abstractmethod +from typing import Callable, Dict, Any, List, Optional, Awaitable +import os, sys, shutil +import tempfile, subprocess +from playwright.async_api import async_playwright, Page, Browser, Error +from io import BytesIO +from PIL import Image, ImageDraw, ImageFont +from pathlib import Path +from playwright.async_api import ProxySettings +from pydantic import BaseModel +import hashlib +import json +import uuid + +from playwright_stealth import StealthConfig, stealth_async + +stealth_config = StealthConfig( + webdriver=True, + chrome_app=True, + chrome_csi=True, + chrome_load_times=True, + chrome_runtime=True, + navigator_languages=True, + navigator_plugins=True, + navigator_permissions=True, + webgl_vendor=True, + outerdimensions=True, + navigator_hardware_concurrency=True, + media_codecs=True, +) + + +class ManagedBrowser: + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): + self.browser_type = browser_type + self.user_data_dir = user_data_dir + self.headless = headless + self.browser_process = None + self.temp_dir = None + self.debugging_port = 9222 + + async def start(self) -> str: + """ + Starts the browser process and returns the CDP endpoint URL. + If user_data_dir is not provided, creates a temporary directory. 
+ """ + + # Create temp dir if needed + if not self.user_data_dir: + self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") + self.user_data_dir = self.temp_dir + + # Get browser path and args based on OS and browser type + browser_path = self._get_browser_path() + args = self._get_browser_args() + + # Start browser process + try: + self.browser_process = subprocess.Popen( + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + await asyncio.sleep(2) # Give browser time to start + return f"http://localhost:{self.debugging_port}" + except Exception as e: + await self.cleanup() + raise Exception(f"Failed to start browser: {e}") + + def _get_browser_path(self) -> str: + """Returns the browser executable path based on OS and browser type""" + if sys.platform == "darwin": # macOS + paths = { + "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", + "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" + } + elif sys.platform == "win32": # Windows + paths = { + "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", + "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", + "webkit": None # WebKit not supported on Windows + } + else: # Linux + paths = { + "chromium": "google-chrome", + "firefox": "firefox", + "webkit": None # WebKit not supported on Linux + } + + return paths.get(self.browser_type) + + def _get_browser_args(self) -> List[str]: + """Returns browser-specific command line arguments""" + base_args = [self._get_browser_path()] + + if self.browser_type == "chromium": + args = [ + f"--remote-debugging-port={self.debugging_port}", + f"--user-data-dir={self.user_data_dir}", + ] + if self.headless: + args.append("--headless=new") + elif self.browser_type == "firefox": + args = [ + "--remote-debugging-port", str(self.debugging_port), + "--profile", self.user_data_dir, + ] + if self.headless: + args.append("--headless") + else: + raise 
NotImplementedError(f"Browser type {self.browser_type} not supported") + + return base_args + args + + async def cleanup(self): + """Cleanup browser process and temporary directory""" + if self.browser_process: + try: + self.browser_process.terminate() + await asyncio.sleep(1) + if self.browser_process.poll() is None: + self.browser_process.kill() + except Exception as e: + print(f"Error terminating browser: {e}") + + if self.temp_dir and os.path.exists(self.temp_dir): + try: + shutil.rmtree(self.temp_dir) + except Exception as e: + print(f"Error removing temporary directory: {e}") + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + status_code: int + screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + + class Config: + arbitrary_types_allowed = True + +class AsyncCrawlerStrategy(ABC): + @abstractmethod + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + pass + + @abstractmethod + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: + pass + + @abstractmethod + async def take_screenshot(self, **kwargs) -> str: + pass + + @abstractmethod + def update_user_agent(self, user_agent: str): + pass + + @abstractmethod + def set_hook(self, hook_type: str, hook: Callable): + pass + +class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + def __init__(self, use_cached_html=False, js_code=None, **kwargs): + self.use_cached_html = use_cached_html + self.user_agent = kwargs.get( + "user_agent", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + self.proxy = kwargs.get("proxy") + self.proxy_config = kwargs.get("proxy_config") + self.headless = kwargs.get("headless", True) + self.browser_type = kwargs.get("browser_type", "chromium") + self.headers = kwargs.get("headers", {}) + self.sessions = {} + self.session_ttl = 1800 + self.js_code = js_code + 
self.verbose = kwargs.get("verbose", False) + self.playwright = None + self.browser = None + self.sleep_on_close = kwargs.get("sleep_on_close", False) + self.use_managed_browser = kwargs.get("use_managed_browser", False) + self.user_data_dir = kwargs.get("user_data_dir", None) + self.use_persistent_context = kwargs.get("use_persistent_context", False) + self.chrome_channel = kwargs.get("chrome_channel", "chrome") + self.managed_browser = None + self.default_context = None + self.hooks = { + 'on_browser_created': None, + 'on_user_agent_updated': None, + 'on_execution_started': None, + 'before_goto': None, + 'after_goto': None, + 'before_return_html': None, + 'before_retrieve_html': None + } + self.extra_args = kwargs.get("extra_args", []) + + async def __aenter__(self): + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def start(self): + if self.playwright is None: + self.playwright = await async_playwright().start() + if self.browser is None: + if self.use_managed_browser: + # Use managed browser approach + self.managed_browser = ManagedBrowser( + browser_type=self.browser_type, + user_data_dir=self.user_data_dir, + headless=self.headless + ) + cdp_url = await self.managed_browser.start() + self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) + + # Get the default context that maintains the user profile + contexts = self.browser.contexts + if contexts: + self.default_context = contexts[0] + else: + # If no default context exists, create one + self.default_context = await self.browser.new_context( + viewport={"width": 1920, "height": 1080} + ) + + # Set up the default context + if self.default_context: + await self.default_context.set_extra_http_headers(self.headers) + + if self.user_agent: + await self.default_context.set_extra_http_headers({ + "User-Agent": self.user_agent + }) + else: + browser_args = { + "headless": self.headless, + "args": [ + "--disable-gpu", + 
"--no-sandbox", + "--disable-dev-shm-usage", + "--disable-blink-features=AutomationControlled", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + # "--disable-http2", + # "--headless=new", # Use the new headless mode + ] + } + + # Add extra args if provided + if self.extra_args: + browser_args["args"].extend(self.extra_args) + + # Add proxy settings if a proxy is specified + if self.proxy: + proxy_settings = ProxySettings(server=self.proxy) + browser_args["proxy"] = proxy_settings + elif self.proxy_config: + proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password")) + browser_args["proxy"] = proxy_settings + + # Select the appropriate browser based on the browser_type + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + # Update the headless configuration + if self.headless: + # Use the new headless mode explicitly + browser_args["args"].append("--headless=new") + + await self.execute_hook('on_browser_created', self.browser) + + async def close(self): + if self.sleep_on_close: + await asyncio.sleep(0.5) + + # Close all active sessions + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + + if self.browser: + await self.browser.close() + self.browser = None + + if self.managed_browser: + await self.managed_browser.cleanup() + self.managed_browser = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + + def __del__(self): + if self.browser or self.playwright: + asyncio.get_event_loop().run_until_complete(self.close()) + + def set_hook(self, hook_type: str, 
hook: Callable): + if hook_type in self.hooks: + self.hooks[hook_type] = hook + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + async def execute_hook(self, hook_type: str, *args): + hook = self.hooks.get(hook_type) + if hook: + if asyncio.iscoroutinefunction(hook): + return await hook(*args) + else: + return hook(*args) + return args[0] if args else None + + def update_user_agent(self, user_agent: str): + self.user_agent = user_agent + + def set_custom_headers(self, headers: Dict[str, str]): + self.headers = headers + + async def kill_session(self, session_id: str): + if session_id in self.sessions: + context, page, _ = self.sessions[session_id] + await page.close() + if not self.use_managed_browser: + await context.close() + del self.sessions[session_id] + + def _cleanup_expired_sessions(self): + current_time = time.time() + expired_sessions = [ + sid for sid, (_, _, last_used) in self.sessions.items() + if current_time - last_used > self.session_ttl + ] + for sid in expired_sessions: + asyncio.create_task(self.kill_session(sid)) + + async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + wait_for = wait_for.strip() + + if wait_for.startswith('js:'): + # Explicitly specified JavaScript + js_code = wait_for[3:].strip() + return await self.csp_compliant_wait(page, js_code, timeout) + elif wait_for.startswith('css:'): + # Explicitly specified CSS selector + css_selector = wait_for[4:].strip() + try: + await page.wait_for_selector(css_selector, timeout=timeout) + except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") + else: + raise ValueError(f"Invalid CSS selector: '{css_selector}'") + else: + # Auto-detect based on content + if wait_for.startswith('()') or wait_for.startswith('function'): + # It's likely a JavaScript function + return await self.csp_compliant_wait(page, wait_for, timeout) + else: + # Assume it's a CSS selector first + try: + 
await page.wait_for_selector(wait_for, timeout=timeout) + except Error as e: + if 'Timeout' in str(e): + raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") + else: + # If it's not a timeout error, it might be an invalid selector + # Let's try to evaluate it as a JavaScript function as a fallback + try: + return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) + except Error: + raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'.") + + async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): + wrapper_js = f""" + async () => {{ + const userFunction = {user_wait_function}; + const startTime = Date.now(); + while (true) {{ + if (await userFunction()) {{ + return true; + }} + if (Date.now() - startTime > {timeout}) {{ + throw new Error('Timeout waiting for condition'); + }} + await new Promise(resolve => setTimeout(resolve, 100)); + }} + }} + """ + + try: + await page.evaluate(wrapper_js) + except TimeoutError: + raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") + except Exception as e: + raise RuntimeError(f"Error in wait condition: {str(e)}") + + async def process_iframes(self, page): + # Find all iframes + iframes = await page.query_selector_all('iframe') + + for i, iframe in enumerate(iframes): + try: + # Add a unique identifier to the iframe + await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') + + # Get the frame associated with this iframe + frame = await iframe.content_frame() + + if frame: + # Wait for the frame to load + await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout + + # Extract the content of the iframe's body + iframe_content = await frame.evaluate('() => document.body.innerHTML') + + # Generate a unique class name for this iframe + class_name = f'extracted-iframe-content-{i}' 
+ + # Replace the iframe with a div containing the extracted content + _iframe = iframe_content.replace('`', '\\`') + await page.evaluate(f""" + () => {{ + const iframe = document.getElementById('iframe-{i}'); + const div = document.createElement('div'); + div.innerHTML = `{_iframe}`; + div.className = '{class_name}'; + iframe.replaceWith(div); + }} + """) + else: + print(f"Warning: Could not access content frame for iframe {i}") + except Exception as e: + print(f"Error processing iframe {i}: {str(e)}") + + # Return the page object + return page + + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + response_headers = {} + status_code = None + + self._cleanup_expired_sessions() + session_id = kwargs.get("session_id") + + # Handle page creation differently for managed browser + if self.use_managed_browser: + if session_id: + # Reuse existing session if available + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if not page: + # Create new page in default context if session doesn't exist + page = await self.default_context.new_page() + self.sessions[session_id] = (self.default_context, page, time.time()) + else: + # Create new page in default context for non-session requests + page = await self.default_context.new_page() + else: + if session_id: + context, page, _ = self.sessions.get(session_id, (None, None, None)) + if not context: + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=True, + java_script_enabled=True + ) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + await context.set_extra_http_headers(self.headers) + page = await context.new_page() + self.sessions[session_id] = (context, page, time.time()) + else: + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1920, "height": 1080}, + proxy={"server": 
self.proxy} if self.proxy else None + ) + await context.set_extra_http_headers(self.headers) + + if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): + # Inject scripts to override navigator properties + await context.add_init_script(""" + // Pass the Permissions Test. + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? + Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + window.navigator.chrome = { + runtime: {}, + // Add other properties if necessary + }; + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5], + }); + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'], + }); + Object.defineProperty(document, 'hidden', { + get: () => false + }); + Object.defineProperty(document, 'visibilityState', { + get: () => 'visible' + }); + """) + + page = await context.new_page() + if kwargs.get("magic", False): + await stealth_async(page, stealth_config) + + # Add console message and error logging + if kwargs.get("log_console", False): + page.on("console", lambda msg: print(f"Console: {msg.text}")) + page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) + + try: + if self.verbose: + print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") + + if self.use_cached_html: + cache_file_path = os.path.join( + Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) + if os.path.exists(cache_file_path): + html = "" + with open(cache_file_path, "r") as f: + html = f.read() + # retrieve response headers and status code from cache + with open(cache_file_path + ".meta", "r") as f: + meta = json.load(f) + response_headers = meta.get("response_headers", {}) + status_code = meta.get("status_code") + response = 
AsyncCrawlResponse( + html=html, response_headers=response_headers, status_code=status_code + ) + return response + + if not kwargs.get("js_only", False): + await self.execute_hook('before_goto', page) + + # response = await page.goto( + # url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) + # ) + + # Add retry logic for HTTP2 errors + max_retries = kwargs.get("max_retries", 3) + current_try = 0 + + while current_try < max_retries: + try: + response = await page.goto( + url, + # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), + wait_until=kwargs.get("wait_until", "networkidle"), + timeout=kwargs.get("page_timeout", 60000) + ) + break + except Exception as e: + current_try += 1 + if "ERR_HTTP2_PROTOCOL_ERROR" in str(e): + if current_try < max_retries: + # Add exponential backoff + await asyncio.sleep(2 ** current_try) + # Try with different protocol + if 'args' not in kwargs: + kwargs['args'] = [] + kwargs['args'].extend(['--disable-http2']) + continue + if current_try == max_retries: + raise + + # response = await page.goto("about:blank") + # await page.evaluate(f"window.location.href = '{url}'") + + await self.execute_hook('after_goto', page) + + # Get status code and headers + status_code = response.status + response_headers = response.headers + else: + status_code = 200 + response_headers = {} + + # Replace the current wait_for_selector line with this more robust check: + try: + # First wait for body to exist, regardless of visibility + await page.wait_for_selector('body', state='attached', timeout=30000) + + # Then wait for it to become visible by checking CSS + await page.wait_for_function(""" + () => { + const body = document.body; + const style = window.getComputedStyle(body); + return style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + } + """, timeout=30000) + + except Error as e: + # If waiting fails, let's try to diagnose the issue + visibility_info = await 
page.evaluate(""" + () => { + const body = document.body; + const style = window.getComputedStyle(body); + return { + display: style.display, + visibility: style.visibility, + opacity: style.opacity, + hasContent: body.innerHTML.length, + classList: Array.from(body.classList) + } + } + """) + + if self.verbose: + print(f"Body visibility debug info: {visibility_info}") + + # Even if body is hidden, we might still want to proceed + if kwargs.get('ignore_body_visibility', True): + if self.verbose: + print("Proceeding despite hidden body...") + pass + else: + raise Error(f"Body element is hidden: {visibility_info}") + + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + + js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) + if js_code: + if isinstance(js_code, str): + await page.evaluate(js_code) + elif isinstance(js_code, list): + for js in js_code: + await page.evaluate(js) + + await page.wait_for_load_state('networkidle') + # Check for on execution event + await self.execute_hook('on_execution_started', page) + + if kwargs.get("simulate_user", False) or kwargs.get("magic", False): + # Simulate user interactions + await page.mouse.move(100, 100) + await page.mouse.down() + await page.mouse.up() + await page.keyboard.press('ArrowDown') + + # Handle the wait_for parameter + wait_for = kwargs.get("wait_for") + if wait_for: + try: + await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) + except Exception as e: + raise RuntimeError(f"Wait condition failed: {str(e)}") + + # Update image dimensions + update_image_dimensions_js = """ + () => { + return new Promise((resolve) => { + const filterImage = (img) => { + // Filter out images that are too small + if (img.width < 100 && img.height < 100) return false; + + // Filter out images that are not visible + const rect = img.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return false; + + // Filter out images with certain class names (e.g., 
icons, thumbnails) + if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; + + // Filter out images with certain patterns in their src (e.g., placeholder images) + if (img.src.includes('placeholder') || img.src.includes('icon')) return false; + + return true; + }; + + const images = Array.from(document.querySelectorAll('img')).filter(filterImage); + let imagesLeft = images.length; + + if (imagesLeft === 0) { + resolve(); + return; + } + + const checkImage = (img) => { + if (img.complete && img.naturalWidth !== 0) { + img.setAttribute('width', img.naturalWidth); + img.setAttribute('height', img.naturalHeight); + imagesLeft--; + if (imagesLeft === 0) resolve(); + } + }; + + images.forEach(img => { + checkImage(img); + if (!img.complete) { + img.onload = () => { + checkImage(img); + }; + img.onerror = () => { + imagesLeft--; + if (imagesLeft === 0) resolve(); + }; + } + }); + + // Fallback timeout of 5 seconds + // setTimeout(() => resolve(), 5000); + resolve(); + }); + } + """ + await page.evaluate(update_image_dimensions_js) + + # Wait a bit for any onload events to complete + await page.wait_for_timeout(100) + + # Process iframes + if kwargs.get("process_iframes", False): + page = await self.process_iframes(page) + + await self.execute_hook('before_retrieve_html', page) + # Check if delay_before_return_html is set then wait for that time + delay_before_return_html = kwargs.get("delay_before_return_html") + if delay_before_return_html: + await asyncio.sleep(delay_before_return_html) + + # Check for remove_overlay_elements parameter + if kwargs.get("remove_overlay_elements", False): + await self.remove_overlay_elements(page) + + html = await page.content() + await self.execute_hook('before_return_html', page, html) + + # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None + if kwargs.get("screenshot"): + # Check we have screenshot_wait_for parameter, if we have simply wait for that time + 
screenshot_wait_for = kwargs.get("screenshot_wait_for") + if screenshot_wait_for: + await asyncio.sleep(screenshot_wait_for) + screenshot_data = await self.take_screenshot(page) + + if self.verbose: + print(f"[LOG] ✅ Crawled {url} successfully!") + + if self.use_cached_html: + cache_file_path = os.path.join( + Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + ) + with open(cache_file_path, "w", encoding="utf-8") as f: + f.write(html) + # store response headers and status code in cache + with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: + json.dump({ + "response_headers": response_headers, + "status_code": status_code + }, f) + + async def get_delayed_content(delay: float = 5.0) -> str: + if self.verbose: + print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") + await asyncio.sleep(delay) + return await page.content() + + response = AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=get_delayed_content + ) + return response + except Error as e: + raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") + # finally: + # if not session_id: + # await page.close() + # await context.close() + + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + async with semaphore: + return await self.crawl(url, **kwargs) + + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + return [result if not isinstance(result, Exception) else str(result) for result in results] + + async def remove_overlay_elements(self, page: Page) -> None: + """ + Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. 
+ + Args: + page (Page): The Playwright page instance + """ + remove_overlays_js = """ + async () => { + // Function to check if element is visible + const isVisible = (elem) => { + const style = window.getComputedStyle(elem); + return style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + }; + + // Common selectors for popups and overlays + const commonSelectors = [ + // Close buttons first + 'button[class*="close" i]', 'button[class*="dismiss" i]', + 'button[aria-label*="close" i]', 'button[title*="close" i]', + 'a[class*="close" i]', 'span[class*="close" i]', + + // Cookie notices + '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', + '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', + + // Newsletter/subscription dialogs + '[class*="newsletter" i]', '[class*="subscribe" i]', + + // Generic popups/modals + '[class*="popup" i]', '[class*="modal" i]', + '[class*="overlay" i]', '[class*="dialog" i]', + '[role="dialog"]', '[role="alertdialog"]' + ]; + + // Try to click close buttons first + for (const selector of commonSelectors.slice(0, 6)) { + const closeButtons = document.querySelectorAll(selector); + for (const button of closeButtons) { + if (isVisible(button)) { + try { + button.click(); + await new Promise(resolve => setTimeout(resolve, 100)); + } catch (e) { + console.log('Error clicking button:', e); + } + } + } + } + + // Remove remaining overlay elements + const removeOverlays = () => { + // Find elements with high z-index + const allElements = document.querySelectorAll('*'); + for (const elem of allElements) { + const style = window.getComputedStyle(elem); + const zIndex = parseInt(style.zIndex); + const position = style.position; + + if ( + isVisible(elem) && + (zIndex > 999 || position === 'fixed' || position === 'absolute') && + ( + elem.offsetWidth > window.innerWidth * 0.5 || + elem.offsetHeight > window.innerHeight * 0.5 || + style.backgroundColor.includes('rgba') || + parseFloat(style.opacity) < 1 + 
) + ) { + elem.remove(); + } + } + + // Remove elements matching common selectors + for (const selector of commonSelectors) { + const elements = document.querySelectorAll(selector); + elements.forEach(elem => { + if (isVisible(elem)) { + elem.remove(); + } + }); + } + }; + + // Remove overlay elements + removeOverlays(); + + // Remove any fixed/sticky position elements at the top/bottom + const removeFixedElements = () => { + const elements = document.querySelectorAll('*'); + elements.forEach(elem => { + const style = window.getComputedStyle(elem); + if ( + (style.position === 'fixed' || style.position === 'sticky') && + isVisible(elem) + ) { + elem.remove(); + } + }); + }; + + removeFixedElements(); + + // Remove empty block elements as: div, p, span, etc. + const removeEmptyBlockElements = () => { + const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); + blockElements.forEach(elem => { + if (elem.innerText.trim() === '') { + elem.remove(); + } + }); + }; + + // Remove margin-right and padding-right from body (often added by modal scripts) + document.body.style.marginRight = '0px'; + document.body.style.paddingRight = '0px'; + document.body.style.overflow = 'auto'; + + // Wait a bit for any animations to complete + await new Promise(resolve => setTimeout(resolve, 100)); + } + """ + + try: + await page.evaluate(remove_overlays_js) + await page.wait_for_timeout(500) # Wait for any animations to complete + except Exception as e: + if self.verbose: + print(f"Warning: Failed to remove overlay elements: {str(e)}") + + async def take_screenshot(self, page: Page) -> str: + try: + # The page is already loaded, just take the screenshot + screenshot = await page.screenshot(full_page=True) + return base64.b64encode(screenshot).decode('utf-8') + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + print(error_message) + + # Generate an 
error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') + finally: + await page.close() + diff --git a/main.py b/main.py index 853cd0b7..a5da029c 100644 --- a/main.py +++ b/main.py @@ -321,7 +321,12 @@ app.add_middleware( # Mount the pages directory as a static directory app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") -app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") + +# Check if site directory exists +if os.path.exists(__location__ + "/site"): + # Mount the site directory as a static directory + app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") + site_templates = Jinja2Templates(directory=__location__ + "/site") templates = Jinja2Templates(directory=__location__ + "/pages") @@ -337,7 +342,10 @@ async def shutdown_event(): @app.get("/") def read_root(): - return RedirectResponse(url="/mkdocs") + if os.path.exists(__location__ + "/site"): + return RedirectResponse(url="/mkdocs") + # Return a json response + return {"message": "Crawl4AI API service is running"} @app.post("/crawl") diff --git a/requirements.txt b/requirements.txt index 9a942958..e83643b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ playwright>=1.47,<1.48 python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 -playwright_stealth~=1.0 +tf-playwright-stealth~=1.0 From bf91adf3f84ade380b819f55c444ed87c80c032b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 13 Nov 2024 15:37:16 +0800 Subject: [PATCH 02/50] fix: Resolve unexpected BrowserContext closure during crawl in Docker - Removed __del__ method in AsyncPlaywrightCrawlerStrategy to ensure reliable browser lifecycle management by using explicit context managers. 
- Added process monitoring in ManagedBrowser to detect and log unexpected terminations of the browser subprocess. - Updated Docker configuration to expose port 9222 for remote debugging and allocate extra shared memory to prevent browser crashes. - Improved error handling and resource cleanup for browser instances, particularly in Docker environments. Resolves Issue #256 --- .gitignore | 1 + Dockerfile | 7 ++++++- README.md | 5 ++++- crawl4ai/__init__.py | 4 ++-- crawl4ai/async_crawler_strategy.py | 22 +++++++++++++++++++--- crawl4ai/async_webcrawler.py | 14 +++++++------- crawl4ai/config.py | 2 ++ crawl4ai/web_crawler.py | 17 ++++++++++++++++- 8 files changed, 57 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 4c3e151e..aca02959 100644 --- a/.gitignore +++ b/.gitignore @@ -199,6 +199,7 @@ test_env/ **/.DS_Store todo.md +todo_executor.md git_changes.py git_changes.md pypi_build.sh diff --git a/Dockerfile b/Dockerfile index 9a921d03..125fb9b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -115,7 +115,12 @@ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ CMD curl -f http://localhost:8000/health || exit 1 # Expose port -EXPOSE 8000 +EXPOSE 8000 11235 9222 8080 + +# Optional: Increase shared memory size to prevent browser crashes +# when loading heavy pages +RUN mkdir /dev/shm +VOLUME /dev/shm # Start the FastAPI server CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"] \ No newline at end of file diff --git a/README.md b/README.md index e1a64aa1..d250f936 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scrapper +# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper unclecode%2Fcrawl4ai | Trendshift @@ -127,6 +127,9 @@ docker pull unclecode/crawl4ai:gpu # GPU-enabled version # Run the container docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version + +# In case to allocate more shared memory for the container +docker 
run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic ``` #### Option 2: Build from Repository diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0c6a2db4..1bcc491c 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -26,5 +26,5 @@ if is_sync_version_installed(): print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.") else: WebCrawler = None - import warnings - print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.") \ No newline at end of file + # import warnings + # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.") \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 896a0644..57288b59 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -64,12 +64,27 @@ class ManagedBrowser: stdout=subprocess.PIPE, stderr=subprocess.PIPE ) + # Monitor browser process output for errors + asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start return f"http://localhost:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") + async def _monitor_browser_process(self): + """Monitor the browser process for unexpected termination.""" + if self.browser_process: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read) + ) + if self.browser_process.poll() is not None: + print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}") + print(f"STDOUT: {stdout.decode()}") + print(f"STDERR: 
{stderr.decode()}") + await self.cleanup() + def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" if sys.platform == "darwin": # macOS @@ -330,9 +345,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.playwright.stop() self.playwright = None - def __del__(self): - if self.browser or self.playwright: - asyncio.get_event_loop().run_until_complete(self.close()) + # Issue #256: Remove __del__ method to avoid potential issues with async cleanup + # def __del__(self): + # if self.browser or self.playwright: + # asyncio.get_event_loop().run_until_complete(self.close()) def set_hook(self, hook_type: str, hook: Callable): if hook_type in self.hooks: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ceb9ad28..f580776b 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -47,17 +47,17 @@ class AsyncWebCrawler: async def awarmup(self): # Print a message for crawl4ai and its version - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") if self.verbose: + print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") print("[LOG] 🌤️ Warming up the AsyncWebCrawler") # await async_db_manager.ainit_db() await async_db_manager.initialize() - await self.arun( - url="https://google.com/", - word_count_threshold=5, - bypass_cache=False, - verbose=False, - ) + # await self.arun( + # url="https://google.com/", + # word_count_threshold=5, + # bypass_cache=False, + # verbose=False, + # ) self.ready = True if self.verbose: print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") diff --git a/crawl4ai/config.py b/crawl4ai/config.py index a07ca977..16638b6d 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -51,3 +51,5 @@ SOCIAL_MEDIA_DOMAINS = [ # If image format is in jpg, png or webp # If image is in the first half of the total images extracted from the page IMAGE_SCORE_THRESHOLD = 2 + +MAX_METRICS_HISTORY = 1000 \ No newline at end of file diff --git 
a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 20e9b04e..95af6c7a 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -10,6 +10,7 @@ from .extraction_strategy import * from .crawler_strategy import * from typing import List from concurrent.futures import ThreadPoolExecutor +from .content_scrapping_strategy import WebScrappingStrategy from .config import * import warnings import json @@ -181,7 +182,21 @@ class WebCrawler: # Extract content from HTML try: t1 = time.time() - result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) + scrapping_strategy = WebScrappingStrategy() + extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]} + result = scrapping_strategy.scrap( + url, + html, + word_count_threshold=word_count_threshold, + css_selector=css_selector, + only_text=kwargs.get("only_text", False), + image_description_min_word_threshold=kwargs.get( + "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD + ), + **extra_params, + ) + + # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) if verbose: print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds") From 61b93ebf362205e2c96c5c2d74bc1b880ca59f51 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 13 Nov 2024 15:38:30 +0800 Subject: [PATCH 03/50] Update change log --- CHANGELOG.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 583c7807..ff52e10e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ # Changelog -# CHANGELOG +## [v0.3.731] - 2024-11-13 Changelog for Issue 256 Fix +- Fixed: Browser context unexpectedly closing in Docker environment during crawl operations. 
+- Removed: __del__ method from AsyncPlaywrightCrawlerStrategy to prevent unreliable asynchronous cleanup, ensuring - browser context is closed explicitly within context managers. +- Added: Monitoring for ManagedBrowser subprocess to detect and log unexpected terminations. +- Updated: Dockerfile configurations to expose debugging port (9222) and allocate additional shared memory for improved browser stability. +- Improved: Error handling and resource cleanup processes for browser lifecycle management within the Docker environment. ## [v0.3.73] - 2024-11-05 From c38ac29edbcebcb2f3672145424e7af3193caa6e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 13 Nov 2024 19:40:40 +0800 Subject: [PATCH 04/50] perf(crawler): major performance improvements & raw HTML support - Switch to lxml parser (~4x speedup) - Add raw HTML & local file crawling support - Fix cache headers & async cleanup - Add browser process monitoring - Optimize BeautifulSoup operations - Pre-compile regex patterns Breaking: Raw HTML handling requires new URL prefixes Fixes: #256, #253 --- CHANGELOG.md | 33 +- crawl4ai/async_crawler_strategy.py | 114 +- crawl4ai/async_database.py | 67 +- crawl4ai/async_webcrawler.py | 127 +- crawl4ai/content_cleaning_strategy.py | 10 +- crawl4ai/content_scrapping_strategy.py | 102 +- crawl4ai/utils.py | 50 +- crawl4ai/web_crawler.py | 4 +- docs/md_v2/basic/prefix-based-input.md | 235 ++ tests/async/sample_wikipedia.html | 2179 ++++++++++++++++++ tests/async/test_content_scraper_strategy.py | 162 ++ 11 files changed, 2953 insertions(+), 130 deletions(-) create mode 100644 docs/md_v2/basic/prefix-based-input.md create mode 100644 tests/async/sample_wikipedia.html create mode 100644 tests/async/test_content_scraper_strategy.py diff --git a/CHANGELOG.md b/CHANGELOG.md index ff52e10e..33d09184 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,30 @@ # Changelog +# Changelog - November 13, 2024 + +### Added +- Support for raw HTML and local file crawling via URL 
prefixes ('raw:', 'file://') +- Browser process monitoring for managed browser instances +- Screenshot capability for raw HTML and local file content +- Response headers storage in cache database +- New `fit_markdown` flag for optional markdown generation + +### Changed +- Switched HTML parser from 'html.parser' to 'lxml' for ~4x performance improvement +- Optimized BeautifulSoup text conversion and element selection +- Pre-compiled regular expressions for better performance +- Improved metadata extraction efficiency +- Response headers now stored alongside HTML in cache + +### Removed +- `__del__` method from AsyncPlaywrightCrawlerStrategy to prevent async cleanup issues + +### Fixed +- Issue #256: Added support for crawling raw HTML content +- Issue #253: Implemented file:// protocol handling +- Missing response headers in cached results +- Memory leaks from improper async cleanup + ## [v0.3.731] - 2024-11-13 Changelog for Issue 256 Fix - Fixed: Browser context unexpectedly closing in Docker environment during crawl operations. - Removed: __del__ method from AsyncPlaywrightCrawlerStrategy to prevent unreliable asynchronous cleanup, ensuring - browser context is closed explicitly within context managers. 
@@ -185,7 +210,7 @@ This commit introduces several key enhancements, including improved error handli ## [v0.3.72] - 2024-10-20 ### Fixed -- Added support for parsing Base64 encoded images in WebScrappingStrategy +- Added support for parsing Base64 encoded images in WebScrapingStrategy ### Added - Forked and integrated a customized version of the html2text library for more control over Markdown generation @@ -208,7 +233,7 @@ This commit introduces several key enhancements, including improved error handli ### Developer Notes - The customized html2text library is now located within the crawl4ai package - New configuration options are available in the `config.py` file for external content handling -- The `WebScrappingStrategy` class has been updated to accommodate new external content exclusion options +- The `WebScrapingStrategy` class has been updated to accommodate new external content exclusion options ## [v0.3.71] - 2024-10-19 @@ -285,7 +310,7 @@ These updates aim to provide more flexibility in text processing, improve perfor ### Improvements 1. **Better Error Handling**: - - Enhanced error reporting in WebScrappingStrategy with detailed error messages and suggestions. + - Enhanced error reporting in WebScrapingStrategy with detailed error messages and suggestions. - Added console message and error logging for better debugging. 2. **Image Processing Enhancements**: @@ -350,7 +375,7 @@ These updates aim to provide more flexibility in text processing, improve perfor - Allows for more customized setups. ### 2. Image Processing Optimization -- Enhanced image handling in WebScrappingStrategy. +- Enhanced image handling in WebScrapingStrategy. - Added filtering for small, invisible, or irrelevant images. - Improved image scoring system for better content relevance. - Implemented JavaScript-based image dimension updating for more accurate representation. 
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 57288b59..baa06e47 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -84,7 +84,7 @@ class ManagedBrowser: print(f"STDOUT: {stdout.decode()}") print(f"STDERR: {stderr.decode()}") await self.cleanup() - + def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" if sys.platform == "darwin": # macOS @@ -493,6 +493,75 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return page async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + """ + Crawls a given URL or processes raw HTML/local file content based on the URL prefix. + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + **kwargs: Additional parameters: + - 'screenshot' (bool): Whether to take a screenshot. + - ... [other existing parameters] + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
+ """ + response_headers = {} + status_code = 200 # Default to 200 for local/raw HTML + screenshot_requested = kwargs.get('screenshot', False) + screenshot_data = None + + if url.startswith(('http://', 'https://')): + # Proceed with standard web crawling + return await self._crawl_web(url, **kwargs) + + elif url.startswith('file://'): + # Process local file + local_file_path = url[7:] # Remove 'file://' prefix + if not os.path.exists(local_file_path): + raise FileNotFoundError(f"Local file not found: {local_file_path}") + with open(local_file_path, 'r', encoding='utf-8') as f: + html = f.read() + if screenshot_requested: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None + ) + + elif url.startswith('raw:'): + # Process raw HTML content + raw_html = url[4:] # Remove 'raw:' prefix + html = raw_html + if screenshot_requested: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None + ) + else: + raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") + + + async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: + """ + Existing web crawling logic remains unchanged. + + Args: + url (str): The web URL to crawl. + **kwargs: Additional parameters. + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
+ """ response_headers = {} status_code = None @@ -792,7 +861,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.verbose: print(f"[LOG] ✅ Crawled {url} successfully!") - + if self.use_cached_html: cache_file_path = os.path.join( Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() @@ -972,6 +1041,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): print(f"Warning: Failed to remove overlay elements: {str(e)}") async def take_screenshot(self, page: Page) -> str: + """ + Takes a screenshot of the current page. + + Args: + page (Page): The Playwright page instance + + Returns: + str: Base64-encoded screenshot image + """ try: # The page is already loaded, just take the screenshot screenshot = await page.screenshot(full_page=True) @@ -991,4 +1069,36 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(buffered.getvalue()).decode('utf-8') finally: await page.close() + + async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: + """ + Generates a screenshot from raw HTML content. + + Args: + html (str): The HTML content to render and capture. + + Returns: + Optional[str]: Base64-encoded screenshot image or an error image if failed. 
+ """ + try: + if not self.browser: + await self.start() + page = await self.browser.new_page() + await page.set_content(html, wait_until='networkidle') + screenshot = await page.screenshot(full_page=True) + await page.close() + return base64.b64encode(screenshot).decode('utf-8') + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + print(error_message) + + # Generate an error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 78931d28..273ca6c9 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -5,6 +5,7 @@ import asyncio from typing import Optional, Tuple, Dict from contextlib import asynccontextmanager import logging +import json # Added for serialization/deserialization # Set up logging logging.basicConfig(level=logging.INFO) @@ -89,7 +90,8 @@ class AsyncDatabaseManager: media TEXT DEFAULT "{}", links TEXT DEFAULT "{}", metadata TEXT DEFAULT "{}", - screenshot TEXT DEFAULT "" + screenshot TEXT DEFAULT "", + response_headers TEXT DEFAULT "{}" -- New column added ) ''') @@ -105,26 +107,51 @@ class AsyncDatabaseManager: column_names = await self.execute_with_retry(_check_columns) - for column in ['media', 'links', 'metadata', 'screenshot']: + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers'] + + for column in new_columns: if column not in column_names: await self.aalter_db_add_column(column) async def aalter_db_add_column(self, new_column: str): """Add new column to the database""" async def _alter(db): - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + if new_column == 
'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') logger.info(f"Added column '{new_column}' to the database.") await self.execute_with_retry(_alter) - async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: + async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]: """Retrieve cached URL data""" async def _get(db): async with db.execute( - 'SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', + ''' + SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + FROM crawled_data WHERE url = ? + ''', (url,) ) as cursor: - return await cursor.fetchone() + row = await cursor.fetchone() + if row: + # Deserialize JSON fields + return ( + row[0], # url + row[1], # html + row[2], # cleaned_html + row[3], # markdown + row[4], # extracted_content + row[5], # success + json.loads(row[6] or '{}'), # media + json.loads(row[7] or '{}'), # links + json.loads(row[8] or '{}'), # metadata + row[9], # screenshot + json.loads(row[10] or '{}') # response_headers + ) + return None try: return await self.execute_with_retry(_get) @@ -132,12 +159,27 @@ class AsyncDatabaseManager: logger.error(f"Error retrieving cached URL: {e}") return None - async def acache_url(self, url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""): + async def acache_url( + self, + url: str, + html: str, + cleaned_html: str, + markdown: str, + extracted_content: str, + success: bool, + media: str = "{}", + links: str = "{}", + metadata: str = "{}", + screenshot: str = "", + 
response_headers: str = "{}" # New parameter added + ): """Cache URL data with retry logic""" async def _cache(db): await db.execute(''' - INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + INSERT INTO crawled_data ( + url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(url) DO UPDATE SET html = excluded.html, cleaned_html = excluded.cleaned_html, @@ -147,8 +189,9 @@ class AsyncDatabaseManager: media = excluded.media, links = excluded.links, metadata = excluded.metadata, - screenshot = excluded.screenshot - ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)) + screenshot = excluded.screenshot, + response_headers = excluded.response_headers -- Update response_headers + ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers)) try: await self.execute_with_retry(_cache) @@ -189,4 +232,4 @@ class AsyncDatabaseManager: logger.error(f"Error flushing database: {e}") # Create a singleton instance -async_db_manager = AsyncDatabaseManager() \ No newline at end of file +async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index f580776b..9d0340dc 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -9,7 +9,7 @@ from .async_database import async_db_manager from .chunking_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse -from .content_scrapping_strategy import WebScrappingStrategy +from .content_scrapping_strategy import WebScrapingStrategy from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD from .utils 
import ( sanitize_input_encode, @@ -47,17 +47,17 @@ class AsyncWebCrawler: async def awarmup(self): # Print a message for crawl4ai and its version + print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") if self.verbose: - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") print("[LOG] 🌤️ Warming up the AsyncWebCrawler") # await async_db_manager.ainit_db() await async_db_manager.initialize() - # await self.arun( - # url="https://google.com/", - # word_count_threshold=5, - # bypass_cache=False, - # verbose=False, - # ) + await self.arun( + url="https://google.com/", + word_count_threshold=5, + bypass_cache=False, + verbose=False, + ) self.ready = True if self.verbose: print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") @@ -75,6 +75,19 @@ class AsyncWebCrawler: verbose=True, **kwargs, ) -> CrawlResult: + """ + Runs the crawler for a single source: URL (web, local file, or raw HTML). + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + ... [other existing parameters] + + Returns: + CrawlResult: The result of the crawling and processing. 
+ """ try: extraction_strategy = extraction_strategy or NoExtractionStrategy() extraction_strategy.verbose = verbose @@ -89,8 +102,13 @@ class AsyncWebCrawler: cached = None screenshot_data = None extracted_content = None - if not bypass_cache and not self.always_by_pass_cache: + + is_web_url = url.startswith(('http://', 'https://')) + if is_web_url and not bypass_cache and not self.always_by_pass_cache: cached = await async_db_manager.aget_cached_url(url) + + # if not bypass_cache and not self.always_by_pass_cache: + # cached = await async_db_manager.aget_cached_url(url) if kwargs.get("warmup", True) and not self.ready: return None @@ -117,25 +135,32 @@ class AsyncWebCrawler: ) crawl_result = await self.aprocess_html( - url, - html, - extracted_content, - word_count_threshold, - extraction_strategy, - chunking_strategy, - css_selector, - screenshot_data, - verbose, - bool(cached), + url=url, + html=html, + extracted_content=extracted_content, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + css_selector=css_selector, + screenshot=screenshot_data, + verbose=verbose, + is_cached=bool(cached), async_response=async_response, bypass_cache=bypass_cache, **kwargs, ) - crawl_result.status_code = async_response.status_code if async_response else 200 - crawl_result.response_headers = async_response.response_headers if async_response else {} + + if async_response: + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = async_response.response_headers + else: + crawl_result.status_code = 200 + crawl_result.response_headers = cached[10] + crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) return crawl_result + except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) @@ -155,22 +180,40 @@ class AsyncWebCrawler: verbose=True, **kwargs, ) -> List[CrawlResult]: - tasks = [ - self.arun( - url, - word_count_threshold, - 
extraction_strategy, - chunking_strategy, - bypass_cache, - css_selector, - screenshot, - user_agent, - verbose, - **kwargs - ) - for url in urls - ] - return await asyncio.gather(*tasks) + """ + Runs the crawler for multiple sources: URLs (web, local files, or raw HTML). + + Args: + urls (List[str]): A list of URLs with supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + ... [other existing parameters] + + Returns: + List[CrawlResult]: The results of the crawling and processing. + """ + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + async with semaphore: + return await self.arun( + url, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + bypass_cache=bypass_cache, + css_selector=css_selector, + screenshot=screenshot, + user_agent=user_agent, + verbose=verbose, + **kwargs, + ) + + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + return [result if not isinstance(result, Exception) else str(result) for result in results] async def aprocess_html( self, @@ -184,13 +227,14 @@ class AsyncWebCrawler: screenshot: str, verbose: bool, is_cached: bool, + async_response: Optional[AsyncCrawlResponse], **kwargs, ) -> CrawlResult: t = time.time() # Extract content from HTML try: t1 = time.time() - scrapping_strategy = WebScrappingStrategy() + scrapping_strategy = WebScrapingStrategy() # result = await scrapping_strategy.ascrap( result = scrapping_strategy.scrap( url, @@ -245,6 +289,12 @@ class AsyncWebCrawler: ) screenshot = None if not screenshot else screenshot + + response_headers = "{}" # Default value + if async_response: + # Serialize response_headers dict to JSON string + response_headers = 
json.dumps(async_response.response_headers, ensure_ascii=False) + if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: await async_db_manager.acache_url( @@ -258,6 +308,7 @@ class AsyncWebCrawler: json.dumps(links), json.dumps(metadata), screenshot=screenshot, + response_headers=response_headers, ) return CrawlResult( diff --git a/crawl4ai/content_cleaning_strategy.py b/crawl4ai/content_cleaning_strategy.py index 2f052f76..b8a5053d 100644 --- a/crawl4ai/content_cleaning_strategy.py +++ b/crawl4ai/content_cleaning_strategy.py @@ -15,7 +15,7 @@ class ContentCleaningStrategy: self.link_density_threshold = 0.2 self.max_dom_depth = 10 # To prevent excessive DOM traversal - def clean(self, clean_html: str) -> str: + def clean(self, clean_html: str, soup = None) -> str: """ Main function that takes cleaned HTML and returns super cleaned HTML. @@ -28,18 +28,20 @@ class ContentCleaningStrategy: try: if not clean_html or not isinstance(clean_html, str): return '' - soup = BeautifulSoup(clean_html, 'html.parser') + if not soup: + # soup = BeautifulSoup(clean_html, 'html.parser') + soup = BeautifulSoup(clean_html, 'lxml') main_content = self.extract_main_content(soup) if main_content: super_clean_element = self.clean_element(main_content) - return str(super_clean_element) + return super_clean_element.encode_contents().decode('utf-8') else: return '' except Exception: # Handle exceptions silently or log them as needed return '' - def extract_main_content(self, soup: BeautifulSoup) -> Optional[Tag]: + def extract_main_content(self, soup) -> Optional[Tag]: """ Identifies and extracts the main content element from the HTML. 
diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index caed7319..a2dbbd96 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -1,3 +1,4 @@ +import re # Point 1: Pre-Compile Regular Expressions from abc import ABC, abstractmethod from typing import Dict, Any from bs4 import BeautifulSoup @@ -105,7 +106,39 @@ class CustomHTML2Text(HTML2Text): return super().handle_data(data, entity_char) -class ContentScrappingStrategy(ABC): +# Pre-compile regular expressions for Open Graph and Twitter metadata +OG_REGEX = re.compile(r'^og:') +TWITTER_REGEX = re.compile(r'^twitter:') +DIMENSION_REGEX = re.compile(r"(\d+)(\D*)") + +# Function to parse image height/width value and units +def parse_dimension(dimension): + if dimension: + # match = re.match(r"(\d+)(\D*)", dimension) + match = DIMENSION_REGEX.match(dimension) + if match: + number = int(match.group(1)) + unit = match.group(2) or 'px' # Default unit is 'px' if not specified + return number, unit + return None, None + +# Fetch image file metadata to extract size and extension +def fetch_image_file_size(img, base_url): + #If src is relative path construct full URL, if not it may be CDN URL + img_url = urljoin(base_url,img.get('src')) + try: + response = requests.head(img_url) + if response.status_code == 200: + return response.headers.get('Content-Length',None) + else: + print(f"Failed to retrieve file size for {img_url}") + return None + except InvalidSchema as e: + return None + finally: + return + +class ContentScrapingStrategy(ABC): @abstractmethod def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: pass @@ -114,7 +147,7 @@ class ContentScrappingStrategy(ABC): async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: pass -class WebScrappingStrategy(ContentScrappingStrategy): +class WebScrapingStrategy(ContentScrapingStrategy): def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return 
self._get_content_of_website_optimized(url, html, is_async=False, **kwargs) @@ -126,9 +159,16 @@ class WebScrappingStrategy(ContentScrappingStrategy): if not html: return None - soup = BeautifulSoup(html, 'html.parser') + # soup = BeautifulSoup(html, 'html.parser') + soup = BeautifulSoup(html, 'lxml') body = soup.body + try: + meta = extract_metadata("", soup) + except Exception as e: + print('Error extracting metadata:', str(e)) + meta = {} + image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) @@ -187,31 +227,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): #Score an image for it's usefulness def score_image_for_usefulness(img, base_url, index, images_count): - # Function to parse image height/width value and units - def parse_dimension(dimension): - if dimension: - match = re.match(r"(\d+)(\D*)", dimension) - if match: - number = int(match.group(1)) - unit = match.group(2) or 'px' # Default unit is 'px' if not specified - return number, unit - return None, None - # Fetch image file metadata to extract size and extension - def fetch_image_file_size(img, base_url): - #If src is relative path construct full URL, if not it may be CDN URL - img_url = urljoin(base_url,img.get('src')) - try: - response = requests.head(img_url) - if response.status_code == 200: - return response.headers.get('Content-Length',None) - else: - print(f"Failed to retrieve file size for {img_url}") - return None - except InvalidSchema as e: - return None - finally: - return image_height = img.get('height') height_value, height_unit = parse_dimension(image_height) @@ -294,7 +310,6 @@ class WebScrappingStrategy(ContentScrappingStrategy): exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', []) exclude_social_media_domains = list(set(exclude_social_media_domains)) - try: if element.name == 'a' and element.get('href'): @@ -439,15 +454,7 @@ class 
WebScrappingStrategy(ContentScrappingStrategy): except Exception as e: print('Error processing element:', str(e)) return False - - #process images by filtering and extracting contextual text from the page - # imgs = body.find_all('img') - # media['images'] = [ - # result for result in - # (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs)) - # if result is not None - # ] - + process_element(body) # Update the links dictionary with unique links @@ -478,8 +485,9 @@ class WebScrappingStrategy(ContentScrappingStrategy): # Replace base64 data with empty string img['src'] = base64_pattern.sub('', src) + str_body = "" try: - str(body) + str_body = body.encode_contents().decode('utf-8') except Exception as e: # Reset body to the original HTML success = False @@ -504,11 +512,12 @@ class WebScrappingStrategy(ContentScrappingStrategy): # Append the error div to the body body.body.append(error_div) + str_body = body.encode_contents().decode('utf-8') print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.") - cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') + cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') try: h = CustomHTML2Text() @@ -518,15 +527,14 @@ class WebScrappingStrategy(ContentScrappingStrategy): markdown = h.handle(sanitize_html(cleaned_html)) markdown = markdown.replace(' ```', '```') - try: - meta = extract_metadata(html, soup) - except Exception as e: - print('Error extracting metadata:', str(e)) - meta = {} + - cleaner = ContentCleaningStrategy() - fit_html = cleaner.clean(cleaned_html) - fit_markdown = h.handle(fit_html) + fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." + fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." 
+ if kwargs.get('fit_markdown', False): + cleaner = ContentCleaningStrategy() + fit_html = cleaner.clean(cleaned_html) + fit_markdown = h.handle(fit_html) cleaned_html = sanitize_html(cleaned_html) return { diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index d96f1ded..d8bd6992 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -736,46 +736,54 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: 'metadata': meta } -def extract_metadata(html, soup = None): +def extract_metadata(html, soup=None): metadata = {} - if not html: + if not html and not soup: + return {} + + if not soup: + soup = BeautifulSoup(html, 'lxml') + + head = soup.head + if not head: return metadata - # Parse HTML content with BeautifulSoup - if not soup: - soup = BeautifulSoup(html, 'html.parser') - # Title - title_tag = soup.find('title') - metadata['title'] = title_tag.string if title_tag else None + title_tag = head.find('title') + metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None # Meta description - description_tag = soup.find('meta', attrs={'name': 'description'}) - metadata['description'] = description_tag['content'] if description_tag else None + description_tag = head.find('meta', attrs={'name': 'description'}) + metadata['description'] = description_tag.get('content', '').strip() if description_tag else None # Meta keywords - keywords_tag = soup.find('meta', attrs={'name': 'keywords'}) - metadata['keywords'] = keywords_tag['content'] if keywords_tag else None + keywords_tag = head.find('meta', attrs={'name': 'keywords'}) + metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None # Meta author - author_tag = soup.find('meta', attrs={'name': 'author'}) - metadata['author'] = author_tag['content'] if author_tag else None + author_tag = head.find('meta', attrs={'name': 'author'}) + metadata['author'] = author_tag.get('content', '').strip() if author_tag else None # Open Graph 
metadata - og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')}) + og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')}) for tag in og_tags: - property_name = tag['property'] - metadata[property_name] = tag['content'] + property_name = tag.get('property', '').strip() + content = tag.get('content', '').strip() + if property_name and content: + metadata[property_name] = content # Twitter Card metadata - twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')}) + twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')}) for tag in twitter_tags: - property_name = tag['name'] - metadata[property_name] = tag['content'] - + property_name = tag.get('name', '').strip() + content = tag.get('content', '').strip() + if property_name and content: + metadata[property_name] = content + return metadata + def extract_xml_tags(string): tags = re.findall(r'<(\w+)>', string) return list(set(tags)) diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 95af6c7a..c97a9cf4 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -10,7 +10,7 @@ from .extraction_strategy import * from .crawler_strategy import * from typing import List from concurrent.futures import ThreadPoolExecutor -from .content_scrapping_strategy import WebScrappingStrategy +from .content_scrapping_strategy import WebScrapingStrategy from .config import * import warnings import json @@ -182,7 +182,7 @@ class WebCrawler: # Extract content from HTML try: t1 = time.time() - scrapping_strategy = WebScrappingStrategy() + scrapping_strategy = WebScrapingStrategy() extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]} result = scrapping_strategy.scrap( url, diff --git a/docs/md_v2/basic/prefix-based-input.md b/docs/md_v2/basic/prefix-based-input.md new file mode 100644 index 00000000..42987a67 --- 
/dev/null +++ b/docs/md_v2/basic/prefix-based-input.md @@ -0,0 +1,235 @@ +# Prefix-Based Input Handling in Crawl4AI + +This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example. + +## Table of Contents +- [Prefix-Based Input Handling in Crawl4AI](#prefix-based-input-handling-in-crawl4ai) + - [Table of Contents](#table-of-contents) + - [Crawling a Web URL](#crawling-a-web-url) + - [Crawling a Local HTML File](#crawling-a-local-html-file) + - [Crawling Raw HTML Content](#crawling-raw-html-content) + - [Complete Example](#complete-example) + - [**How It Works**](#how-it-works) + - [**Running the Example**](#running-the-example) + - [Conclusion](#conclusion) + +--- + + +### Crawling a Web URL + +To crawl a live web page, provide the URL starting with `http://` or `https://`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def crawl_web(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", bypass_cache=True) + if result.success: + print("Markdown Content:") + print(result.markdown) + else: + print(f"Failed to crawl: {result.error_message}") + +asyncio.run(crawl_web()) +``` + +### Crawling a Local HTML File + +To crawl a local HTML file, prefix the file path with `file://`. 
+ +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def crawl_local_file(): + local_file_path = "/path/to/apple.html" # Replace with your file path + file_url = f"file://{local_file_path}" + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url=file_url, bypass_cache=True) + if result.success: + print("Markdown Content from Local File:") + print(result.markdown) + else: + print(f"Failed to crawl local file: {result.error_message}") + +asyncio.run(crawl_local_file()) +``` + +### Crawling Raw HTML Content + +To crawl raw HTML content, prefix the HTML string with `raw:`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def crawl_raw_html(): + raw_html = "

<html><body><h1>Hello, World!</h1></body></html>

" + raw_html_url = f"raw:{raw_html}" + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url=raw_html_url, bypass_cache=True) + if result.success: + print("Markdown Content from Raw HTML:") + print(result.markdown) + else: + print(f"Failed to crawl raw HTML: {result.error_message}") + +asyncio.run(crawl_raw_html()) +``` + +--- + +## Complete Example + +Below is a comprehensive script that: +1. **Crawls the Wikipedia page for "Apple".** +2. **Saves the HTML content to a local file (`apple.html`).** +3. **Crawls the local HTML file and verifies the markdown length matches the original crawl.** +4. **Crawls the raw HTML content from the saved file and verifies consistency.** + +```python +import os +import sys +import asyncio +from pathlib import Path + +# Adjust the parent directory to include the crawl4ai module +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai import AsyncWebCrawler + +async def main(): + # Define the URL to crawl + wikipedia_url = "https://en.wikipedia.org/wiki/apple" + + # Define the path to save the HTML file + # Save the file in the same directory as the script + script_dir = Path(__file__).parent + html_file_path = script_dir / "apple.html" + + async with AsyncWebCrawler(verbose=True) as crawler: + print("\n=== Step 1: Crawling the Wikipedia URL ===") + # Crawl the Wikipedia URL + result = await crawler.arun(url=wikipedia_url, bypass_cache=True) + + # Check if crawling was successful + if not result.success: + print(f"Failed to crawl {wikipedia_url}: {result.error_message}") + return + + # Save the HTML content to a local file + with open(html_file_path, 'w', encoding='utf-8') as f: + f.write(result.html) + print(f"Saved HTML content to {html_file_path}") + + # Store the length of the generated markdown + web_crawl_length = len(result.markdown) + print(f"Length of markdown from web crawl: {web_crawl_length}\n") + + print("=== Step 2: 
Crawling from the Local HTML File ===") + # Construct the file URL with 'file://' prefix + file_url = f"file://{html_file_path.resolve()}" + + # Crawl the local HTML file + local_result = await crawler.arun(url=file_url, bypass_cache=True) + + # Check if crawling was successful + if not local_result.success: + print(f"Failed to crawl local file {file_url}: {local_result.error_message}") + return + + # Store the length of the generated markdown from local file + local_crawl_length = len(local_result.markdown) + print(f"Length of markdown from local file crawl: {local_crawl_length}") + + # Compare the lengths + assert web_crawl_length == local_crawl_length, ( + f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Local file crawl ({local_crawl_length})" + ) + print("✅ Markdown length matches between web crawl and local file crawl.\n") + + print("=== Step 3: Crawling Using Raw HTML Content ===") + # Read the HTML content from the saved file + with open(html_file_path, 'r', encoding='utf-8') as f: + raw_html_content = f.read() + + # Prefix the raw HTML content with 'raw:' + raw_html_url = f"raw:{raw_html_content}" + + # Crawl using the raw HTML content + raw_result = await crawler.arun(url=raw_html_url, bypass_cache=True) + + # Check if crawling was successful + if not raw_result.success: + print(f"Failed to crawl raw HTML content: {raw_result.error_message}") + return + + # Store the length of the generated markdown from raw HTML + raw_crawl_length = len(raw_result.markdown) + print(f"Length of markdown from raw HTML crawl: {raw_crawl_length}") + + # Compare the lengths + assert web_crawl_length == raw_crawl_length, ( + f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Raw HTML crawl ({raw_crawl_length})" + ) + print("✅ Markdown length matches between web crawl and raw HTML crawl.\n") + + print("All tests passed successfully!") + + # Clean up by removing the saved HTML file + if html_file_path.exists(): + os.remove(html_file_path) + 
print(f"Removed the saved HTML file: {html_file_path}") + +# Run the main function +if __name__ == "__main__": + asyncio.run(main()) +``` + +### **How It Works** + +1. **Step 1: Crawl the Web URL** + - Crawls `https://en.wikipedia.org/wiki/apple`. + - Saves the HTML content to `apple.html`. + - Records the length of the generated markdown. + +2. **Step 2: Crawl from the Local HTML File** + - Uses the `file://` prefix to crawl `apple.html`. + - Ensures the markdown length matches the original web crawl. + +3. **Step 3: Crawl Using Raw HTML Content** + - Reads the HTML from `apple.html`. + - Prefixes it with `raw:` and crawls. + - Verifies the markdown length matches the previous results. + +4. **Cleanup** + - Deletes the `apple.html` file after testing. + +### **Running the Example** + +1. **Save the Script:** + - Save the above code as `test_crawl4ai.py` in your project directory. + +2. **Execute the Script:** + - Run the script using: + ```bash + python test_crawl4ai.py + ``` + +3. **Observe the Output:** + - The script will print logs detailing each step. + - Assertions ensure consistency across different crawling methods. + - Upon success, it confirms that all markdown lengths match. + +--- + +## Conclusion + +With the new prefix-based input handling in **Crawl4AI**, you can effortlessly crawl web URLs, local HTML files, and raw HTML strings using a unified `url` parameter. This enhancement simplifies the API usage and provides greater flexibility for diverse crawling scenarios. + diff --git a/tests/async/sample_wikipedia.html b/tests/async/sample_wikipedia.html new file mode 100644 index 00000000..a22b3e3f --- /dev/null +++ b/tests/async/sample_wikipedia.html @@ -0,0 +1,2179 @@ + + +Apple - Wikipedia + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Jump to content +
+
+
+ + + + +
+
+ + + + + +
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+
+ +

Apple

+ +
+ + +
+ +
+ + + +
+ +
+
+
+
+
+
+ +
+
+ + + +
+
+
+
+
+ + +
+
+
+
+
+
This is a good article. Click here for more information.
+
Page semi-protected
+
+ +
From Wikipedia, the free encyclopedia
+
+
+ + +
+ + +

+ + + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Apple +
+
'Cripps Pink' apples +
+
Flowers +
Scientific classification Edit this classification +
Kingdom: +Plantae +
Clade: +Tracheophytes +
Clade: +Angiosperms +
Clade: +Eudicots +
Clade: +Rosids +
Order: +Rosales +
Family: +Rosaceae +
Genus: +Malus +
Species: +
M. domestica
+
Binomial name +
Malus domestica
+
Synonyms[1][2] +
+
  • M. communis Desf., 1768
  • +
  • M. pumila Mil.
  • +
  • M. frutescens Medik.
  • +
  • M. paradisiaca (L.) Medikus
  • +
  • M. sylvestris Mil.
  • +
  • Pyrus malus L.
  • +
  • Pyrus malus var. paradisiaca L.
  • +
  • Pyrus dioica Moench
+
+

An apple is a round, edible fruit produced by an apple tree (Malus spp., among them the domestic or orchard apple; Malus domestica). Apple trees are cultivated worldwide and are the most widely grown species in the genus Malus. The tree originated in Central Asia, where its wild ancestor, Malus sieversii, is still found. Apples have been grown for thousands of years in Eurasia and were introduced to North America by European colonists. Apples have religious and mythological significance in many cultures, including Norse, Greek, and European Christian tradition. +

Apples grown from seed tend to be very different from those of their parents, and the resultant fruit frequently lacks desired characteristics. For commercial purposes, including botanical evaluation, apple cultivars are propagated by clonal grafting onto rootstocks. Apple trees grown without rootstocks tend to be larger and much slower to fruit after planting. Rootstocks are used to control the speed of growth and the size of the resulting tree, allowing for easier harvesting. +

There are more than 7,500 cultivars of apples. Different cultivars are bred for various tastes and uses, including cooking, eating raw, and cider or apple juice production. Trees and fruit are prone to fungal, bacterial, and pest problems, which can be controlled by a number of organic and non-organic means. In 2010, the fruit's genome was sequenced as part of research on disease control and selective breeding in apple production. +

+ +

Etymology

+

The word apple, whose Old English ancestor is æppel, is descended from the Proto-Germanic noun *aplaz, descended in turn from Proto-Indo-European *h₂ébōl.[3] As late as the 17th century, the word also functioned as a generic term for all fruit, including nuts. This can be compared to the 14th-century Middle English expression appel of paradis, meaning a banana.[4] +

+

Description

+

The apple is a deciduous tree, generally standing 2 to 4.5 metres (6 to 15 feet) tall in cultivation and up to 15 m (49 ft) in the wild, though more typically 2 to 10 m (6.5 to 33 ft).[5][1] When cultivated, the size, shape and branch density are determined by rootstock selection and trimming method.[5] Apple trees may naturally have a rounded to erect crown with a dense canopy of leaves.[6] The bark of the trunk is dark gray or gray-brown, but young branches are reddish or dark-brown with a smooth texture.[1][7] Young twigs are covered in very fine downy hairs and become hairless as they age.[7] +

The buds are egg-shaped and dark red or purple in color; they range in size from 3 to 5 millimeters, but are usually less than 4 mm. The bud scales have very hairy edges. When emerging from the buds, the leaves are convolute, meaning that their edges overlap each other.[1] Leaves can be simple ovals (elliptic), medium or wide in width, somewhat egg-shaped with the wider portion toward their base (ovate), or even with sides that are more parallel to each other instead of curved (oblong) with a narrow pointed end.[7][1] The edges have broadly-angled teeth, but do not have lobes. The top surfaces of the leaves are glabrescent, almost hairless, while the undersides are densely covered in fine hairs.[1] The leaves are attached alternately by short leaf stems 1-to-3.5 cm (1/2-to-1+1/2 in) long.[6][1] +

Blossoms are produced in spring simultaneously with the budding of the leaves and are produced on spurs and some long shoots.[5] When the flower buds first begin to open the petals are rose-pink and fade to white or light pink when fully open with each flower 3-to-4-centimeter (1-to-1+1/2-inch) in diameter.[1] The five-petaled flowers are grouped in an inflorescence consisting of a cyme with 3–7 flowers.[8] The central flower of the inflorescence is called the "king bloom"; it opens first and can develop a larger fruit.[6] Open apple blossoms are damaged by even brief exposures to temperatures of −2 °C (28 °F) or less, although the overwintering wood and buds are hardy down to −40 °C (−40 °F).[8] +

+ +

Fruit

+

The fruit is a pome that matures in late summer or autumn.[1] The true fruits or carpels are the harder interior chambers inside the apple's core. There are usually five carpels inside an apple, but there may be as few as three. Each of the chambers contains one or two seeds.[9] The edible flesh is formed from the receptacle at the base of the flower.[10] +

+ +

The seeds are egg- to pear-shaped and may be colored from light brown or tan to a very dark brown, often with red shades or even purplish-black. They may have a blunt or sharp point.[11] The five sepals remain attached and stand out from the surface of the apple.[1] +

The size of the fruit varies widely between cultivars, but generally has a diameter between 2.5 and 12 cm (1 and 5 in).[7] The shape is quite variable and may be nearly round, elongated, conical, or short and wide.[12] +

The groundcolor of ripe apples is yellow, green, yellow-green or whitish yellow. The overcolor of ripe apples can be orange-red, pink-red, red, purple-red or brown-red. The overcolor amount can be 0–100%.[13] The skin may be wholly or partly russeted, making it rough and brown. The skin is covered in a protective layer of epicuticular wax.[14] The skin may also be marked with scattered dots.[1] The flesh is generally pale yellowish-white, though it can be pink, yellow or green.[13] +

+ +

Chemistry

+

Important volatile compounds in apples that contribute to their scent and flavour include acetaldehyde, ethyl acetate, 1-butanal, ethanol, 2-methylbutanal, 3-methylbutanal, ethyl propionate, ethyl 2-methylpropionate, ethyl butyrate, ethyl 2-methyl butyrate, hexanal, 1-butanol, 3-methylbutyl acetate, 2-methylbutyl acetate, 1-propyl butyrate, ethyl pentanoate, amyl acetate, 2-methyl-1-butanol, trans-2-hexenal, ethyl hexanoate, hexanol.[15][16] +

+

Taxonomy

+

The apple as a species has more than 100 alternative scientific names, or synonyms.[17] In modern times, Malus pumila and Malus domestica are the two main names in use. M. pumila is the older name, but M. domestica has become much more commonly used starting in the 21st century, especially in the western world. Two proposals were made to make M. domestica a conserved name: the earlier proposal was voted down by the Committee for Vascular Plants of the IAPT in 2014, but in April 2017 the Committee decided, with a narrow majority, that the newly popular name should be conserved.[18] The General Committee of the IAPT decided in June 2017 to approve this change, officially conserving M. domestica.[19] Nevertheless, some works published after 2017 still use M. pumila as the correct name, under an alternate taxonomy.[2] +

When first classified by Linnaeus in 1753, the pears, apples, and quinces were combined into one genus that he named Pyrus and he named the apple as Pyrus malus. This was widely accepted; however, the botanist Philip Miller published an alternate classification in The Gardeners Dictionary with the apple species separated from Pyrus in 1754. He did not clearly indicate that by Malus pumila he meant the domesticated apple. Nonetheless, it was used as such by many botanists. When Moritz Balthasar Borkhausen published his scientific description of the apple in 1803 it may have been a new combination of P. malus var. domestica, but this was not directly referenced by Borkhausen.[17] The earliest use of var. domestica for the apple was by Georg Adolf Suckow in 1786.[2] +

+

Genome

+ +

Apples are diploid, with two sets of chromosomes per cell (though triploid cultivars, with three sets, are not uncommon), have 17 chromosomes and an estimated genome size of approximately 650 Mb. Several whole genome sequences have been completed and made available. The first one in 2010 was based on the diploid cultivar 'Golden Delicious'.[20] However, this first whole genome sequence contained several errors,[21] in part owing to the high degree of heterozygosity in diploid apples which, in combination with an ancient genome duplication, complicated the assembly. Recently, double- and trihaploid individuals have been sequenced, yielding whole genome sequences of higher quality.[22][23] +

The first whole genome assembly was estimated to contain around 57,000 genes,[20] though the more recent genome sequences support estimates between 42,000 and 44,700 protein-coding genes.[22][23] The availability of whole genome sequences has provided evidence that the wild ancestor of the cultivated apple most likely is Malus sieversii. Re-sequencing of multiple accessions has supported this, while also suggesting extensive introgression from Malus sylvestris following domestication.[24] +

+

Cultivation

+

History

+
Map of the origins of the cultivated apple. The wild origin is in Kazakhstan; hybridisations and repeated domestications followed, modifying many attributes of the fruit.[24]
+
color photograph of a hand holding a red apple
Wild Malus sieversii apple in Kazakhstan
+

Central Asia is generally considered the center of origin for apples due to the genetic variability in specimens there.[25] The wild ancestor of Malus domestica was Malus sieversii, found growing wild in the mountains of Central Asia in southern Kazakhstan, Kyrgyzstan, Tajikistan, and northwestern China.[5][26] Cultivation of the species, most likely beginning on the forested flanks of the Tian Shan mountains, progressed over a long period of time and permitted secondary introgression of genes from other species into the open-pollinated seeds. Significant exchange with Malus sylvestris, the crabapple, resulted in populations of apples being more related to crabapples than to the more morphologically similar progenitor Malus sieversii. In strains without recent admixture the contribution of the latter predominates.[27][28][29] +

The apple is thought to have been domesticated 4,000–10,000 years ago in the Tian Shan mountains, and then to have travelled along the Silk Road to Europe, with hybridization and introgression of wild crabapples from Siberia (M. baccata), the Caucasus (M. orientalis), and Europe (M. sylvestris). Only the M. sieversii trees growing on the western side of the Tian Shan mountains contributed genetically to the domesticated apple, not the isolated population on the eastern side.[24] +

Chinese soft apples, such as M. asiatica and M. prunifolia, have been cultivated as dessert apples for more than 2,000 years in China. These are thought to be hybrids between M. baccata and M. sieversii in Kazakhstan.[24] +

Among the traits selected for by human growers are size, fruit acidity, color, firmness, and soluble sugar. Unusually for domesticated fruits, the wild M. sieversii origin is only slightly smaller than the modern domesticated apple.[24] +

At the Sammardenchia-Cueis site near Udine in Northeastern Italy, seeds from some form of apples have been found in material carbon dated to between 6570 and 5684 BCE.[30] Genetic analysis has not yet been successfully used to determine whether such ancient apples were wild Malus sylvestris or Malus domestica containing Malus sieversii ancestry. It is hard to distinguish in the archeological record between foraged wild apples and apple plantations.[31] +

There is indirect evidence of apple cultivation in the third millennium BCE in the Middle East.[31] There is direct evidence, apple cores, dated to the 10th century BCE from a Judean site between the Sinai and Negev.[32] There was substantial apple production in European classical antiquity, and grafting was certainly known then.[31] Grafting is an essential part of modern domesticated apple production, to be able to propagate the best cultivars; it is unclear when apple tree grafting was invented.[31] +

+ +

The Roman writer Pliny the Elder describes a method of storage for apples from his time in the 1st century. He says they should be placed in a room with good air circulation from a north facing window on a bed of straw, chaff, or mats with windfalls kept separately.[33] Though methods like this will extend the availability of reasonably fresh apples, without refrigeration their lifespan is limited. Even sturdy winter apple varieties will only keep well until December in cool climates.[34] For longer storage medieval Europeans strung up cored and peeled apples to dry, either whole or sliced into rings.[35] +

Of the many Old World plants that the Spanish introduced to Chiloé Archipelago in the 16th century, apple trees became particularly well adapted.[36] Apples were introduced to North America by colonists in the 17th century,[5] and the first named apple cultivar was introduced in Boston by Reverend William Blaxton in 1640.[37] The only apples native to North America are crab apples.[38] +

Apple cultivars brought as seed from Europe were spread along Native American trade routes, as well as being cultivated on colonial farms. An 1845 United States apples nursery catalogue sold 350 of the "best" cultivars, showing the proliferation of new North American cultivars by the early 19th century.[38] In the 20th century, irrigation projects in Eastern Washington began and allowed the development of the multibillion-dollar fruit industry, of which the apple is the leading product.[5] +

Until the 20th century, farmers stored apples in frostproof cellars during the winter for their own use or for sale. Improved transportation of fresh apples by train and road replaced the necessity for storage.[39][40] Controlled atmosphere facilities are used to keep apples fresh year-round. Controlled atmosphere facilities use high humidity, low oxygen, and controlled carbon dioxide levels to maintain fruit freshness. They were first researched at Cambridge University in the 1920s and first used in the United States in the 1950s.[41] +

+

Breeding

+ +
An apple tree in Germany
+

Many apples grow readily from seeds. However, apples must be propagated asexually to obtain cuttings with the characteristics of the parent. This is because seedling apples are "extreme heterozygotes". Rather than resembling their parents, seedlings are all different from each other and from their parents.[42] Triploid cultivars have an additional reproductive barrier in that three sets of chromosomes cannot be divided evenly during meiosis, yielding unequal segregation of the chromosomes (aneuploids). Even in the case when a triploid plant can produce a seed (apples are an example), it occurs infrequently, and seedlings rarely survive.[43] +

Because apples are not true breeders when planted as seeds, propagation usually involves grafting of cuttings. The rootstock used for the bottom of the graft can be selected to produce trees of a large variety of sizes, as well as changing the winter hardiness, insect and disease resistance, and soil preference of the resulting tree. Dwarf rootstocks can be used to produce very small trees (less than 3.0 m or 10 ft high at maturity), which bear fruit many years earlier in their life cycle than full size trees, and are easier to harvest.[44] +

Dwarf rootstocks for apple trees can be traced as far back as 300 BCE, to the area of Persia and Asia Minor. Alexander the Great sent samples of dwarf apple trees to Aristotle's Lyceum. Dwarf rootstocks became common by the 15th century and later went through several cycles of popularity and decline throughout the world.[45] The majority of the rootstocks used to control size in apples were developed in England in the early 1900s. The East Malling Research Station conducted extensive research into rootstocks, and their rootstocks are given an "M" prefix to designate their origin. Rootstocks marked with an "MM" prefix are Malling-series cultivars later crossed with trees of 'Northern Spy' in Merton, England.[46] +

Most new apple cultivars originate as seedlings, which either arise by chance or are bred by deliberately crossing cultivars with promising characteristics.[47] The words "seedling", "pippin", and "kernel" in the name of an apple cultivar suggest that it originated as a seedling. Apples can also form bud sports (mutations on a single branch). Some bud sports turn out to be improved strains of the parent cultivar. Some differ sufficiently from the parent tree to be considered new cultivars.[48] +

Apples have been acclimatized in Ecuador at very high altitudes, where they can often, with the needed factors, provide crops twice per year because of constant temperate conditions year-round.[49] +

+

Pollination

+ +
Apple blossom from an old Ayrshire cultivar
+
An orchard mason bee on an apple bloom in British Columbia, Canada
+

Apples are self-incompatible; they must cross-pollinate to develop fruit. During the flowering each season, apple growers often utilize pollinators to carry pollen. Honey bees are most commonly used. Orchard mason bees are also used as supplemental pollinators in commercial orchards. Bumblebee queens are sometimes present in orchards, but not usually in sufficient number to be significant pollinators.[48][50] +

Cultivars are sometimes classified by the day of peak bloom in the average 30-day blossom period, with pollinizers selected from cultivars within a 6-day overlap period. There are four to seven pollination groups in apples, depending on climate: +

+
  • Group A – Early flowering, 1 to 3 May in England ('Gravenstein', 'Red Astrachan')
  • +
  • Group B – 4 to 7 May ('Idared', 'McIntosh')
  • +
  • Group C – Mid-season flowering, 8 to 11 May ('Granny Smith', 'Cox's Orange Pippin')
  • +
  • Group D – Mid/late season flowering, 12 to 15 May ('Golden Delicious', 'Calville blanc d'hiver')
  • +
  • Group E – Late flowering, 16 to 18 May ('Braeburn', 'Reinette d'Orléans')
  • +
  • Group F – 19 to 23 May ('Suntan')
  • +
  • Group H – 24 to 28 May ('Court-Pendu Gris' – also called Court-Pendu plat)
+

One cultivar can be pollinated by a compatible cultivar from the same group or close (A with A, or A with B, but not A with C or D).[51] +

+

Maturation and harvest

+ +
L. K. Relander, the former President of Finland, with his family picking apples in the 1930s
+

Cultivars vary in their yield and the ultimate size of the tree, even when grown on the same rootstock. Some cultivars, if left unpruned, grow very large—letting them bear more fruit, but making harvesting more difficult. Depending on tree density (number of trees planted per unit surface area), mature trees typically bear 40–200 kg (90–440 lb) of apples each year, though productivity can be close to zero in poor years. Apples are harvested using three-point ladders that are designed to fit amongst the branches. Trees grafted on dwarfing rootstocks bear about 10–80 kg (20–180 lb) of fruit per year.[48] +

Some farms with apple orchards open them to the public so consumers can pick their own apples.[52] +

Crops ripen at different times of the year according to the cultivar. Cultivars that yield their crop in the summer include 'Sweet Bough' and 'Duchess'; fall producers include 'Blenheim'; winter producers include 'King', 'Swayzie', and 'Tolman Sweet'.[38] +

+

Storage

+
Different apple cultivars in a wholesale food market
+

Commercially, apples can be stored for months in controlled atmosphere chambers. Apples are commonly stored in chambers with lowered concentrations of oxygen to reduce respiration and slow softening and other changes if the fruit is already fully ripe. The gas ethylene is used by plants as a hormone which promotes ripening, decreasing the time an apple can be stored. For storage longer than about six months the apples are picked earlier, before full ripeness, when ethylene production by the fruit is low. However, in many varieties this increases their sensitivity to carbon dioxide, which also must be controlled.[53] +

For home storage, most cultivars of apple can be stored for three weeks in a pantry and four to six weeks from the date of purchase in a refrigerator that maintains 4 to 0 °C (39 to 32 °F).[54][55] Some varieties of apples (e.g. 'Granny Smith' and 'Fuji') have more than three times the storage life of others.[56] +

Non-organic apples may be sprayed with 1-methylcyclopropene, a substance that blocks the apples' ethylene receptors, temporarily preventing them from ripening.[57] +

+

Pests and diseases

+ +
Codling moth larva tunnelling inside an apple
+

Apple trees are susceptible to fungal and bacterial diseases, and to damage by insect pests. Many commercial orchards pursue a program of chemical sprays to maintain high fruit quality, tree health, and high yields. Organic orchards prohibit the use of synthetic pesticides, though some older pesticides are allowed. Organic methods include, for instance, introducing a natural predator of a particular pest to reduce its population. +

A wide range of pests and diseases can affect the plant. Three of the more common diseases or pests are mildew, aphids, and apple scab. +

+
  • Mildew is characterized by light grey powdery patches appearing on the leaves, shoots and flowers, normally in spring. The flowers turn a creamy yellow color and do not develop correctly. This can be treated similarly to Botrytis—eliminating the conditions that caused the disease and burning the infected plants are among recommended actions.[58]
  • +
  • Aphids are small insects with sucking mouthparts. Five species of aphids commonly attack apples: apple grain aphid, rosy apple aphid, apple aphid, spirea aphid, and the woolly apple aphid. The aphid species can be identified by color, time of year, and by differences in the cornicles (small paired projections from their rear).[59] Aphids feed on foliage using needle-like mouth parts to suck out plant juices. When present in high numbers, certain species reduce tree growth and vigor.[60]
  • +
  • Apple scab: Apple scab causes leaves to develop olive-brown spots with a velvety texture that later turn brown and become cork-like in texture. The disease also affects the fruit, which also develops similar brown spots with velvety or cork-like textures. Apple scab is spread through fungus growing in old apple leaves on the ground and spreads during warm spring weather to infect the new year's growth.[61]
+

Among the most serious disease problems is a bacterial disease called fireblight, and three fungal diseases: Gymnosporangium rust, black spot,[62] and bitter rot.[63] Codling moths, and the apple maggots of fruit flies, cause serious damage to apple fruits, making them unsaleable. Young apple trees are also prone to mammal pests like mice and deer, which feed on the soft bark of the trees, especially in winter.[61] The larvae of the apple clearwing moth (red-belted clearwing) burrow through the bark and into the phloem of apple trees, potentially causing significant damage.[64] +

+

Cultivars

+ +
An assortment of apple cultivars
+

There are more than 7,500 known cultivars (cultivated varieties) of apples.[65] Cultivars vary in their yield and the ultimate size of the tree, even when grown on the same rootstock.[66] Different cultivars are available for temperate and subtropical climates. The UK's National Fruit Collection, which is the responsibility of the Department of Environment, Food, and Rural Affairs, includes a collection of over 2,000 cultivars of apple tree in Kent.[67] The University of Reading, which is responsible for developing the UK national collection database, provides access to search the national collection. The University of Reading's work is part of the European Cooperative Programme for Plant Genetic Resources of which there are 38 countries participating in the Malus/Pyrus work group.[68] +

The UK's national fruit collection database contains much information on the characteristics and origin of many apples, including alternative names for what is essentially the same "genetic" apple cultivar. Most of these cultivars are bred for eating fresh (dessert apples), though some are cultivated specifically for cooking (cooking apples) or producing cider. Cider apples are typically too tart and astringent to eat fresh, but they give the beverage a rich flavor that dessert apples cannot.[69] +

In the United States there are many apple breeding programs associated with universities. Cornell University has had a program operating since 1880 in Geneva, New York. Among their recent well-known apples is the 'SnapDragon' cultivar released in 2013. In the west, Washington State University started a program to support their apple industry in 1994 and released the 'Cosmic Crisp' cultivar in 2017. The third most grown apple cultivar in the United States is the 'Honeycrisp', released by the University of Minnesota program in 1991.[70] Unusually for a popular cultivar, the 'Honeycrisp' is not directly related to another popular apple cultivar but instead to two unsuccessful cultivars.[71] In Europe there are also many breeding programs such as the Julius Kühn-Institut, the German federal research center for cultivated plants.[72] +

Commercially popular apple cultivars are soft but crisp. Other desirable qualities in modern commercial apple breeding are a colorful skin, absence of russeting, ease of shipping, lengthy storage ability, high yields, disease resistance, common apple shape, and developed flavor.[66] Modern apples are generally sweeter than older cultivars, as popular tastes in apples have varied over time. Most North Americans and Europeans favor sweet, subacid apples, but tart apples have a strong minority following.[73] Extremely sweet apples with barely any acid flavor are popular in Asia,[73] especially the Indian subcontinent.[69] +

+
Less common apple cultivars from an orchard in Italy
+

Old cultivars are often oddly shaped, russeted, and grow in a variety of textures and colors. Some find them to have better flavor than modern cultivars, but they may have other problems that make them commercially unviable—low yield, disease susceptibility, poor tolerance for storage or transport, or just being the "wrong" size.[74] A few old cultivars are still produced on a large scale, but many have been preserved by home gardeners and farmers that sell directly to local markets. Many unusual and locally important cultivars with their own unique taste and appearance exist; apple conservation campaigns have sprung up around the world to preserve such local cultivars from extinction. In the United Kingdom, old cultivars such as 'Cox's Orange Pippin' and 'Egremont Russet' are still commercially important even though by modern standards they are low yielding and susceptible to disease.[5] +

+

Production

+ + + + + + + + + + + + + + + + + + + + + + + +
Apple production
+

2022, millions of tonnes
+

+
 China47.6 +
 United States4.8 +
 Turkey4.4 +
 Poland4.3 +
 India2.6 +
World95.8 +
Source: FAOSTAT of the United Nations[75] +
+

World production of apples in 2022 was 96 million tonnes, with China producing 50% of the total (table).[75] Secondary producers were the United States, Turkey, and Poland.[75] +

+

Toxicity

+

Amygdalin

+

Apple seeds contain small amounts of amygdalin, a sugar and cyanide compound known as a cyanogenic glycoside. Ingesting small amounts of apple seeds causes no ill effects, but consumption of extremely large doses can cause adverse reactions. It may take several hours before the poison takes effect, as cyanogenic glycosides must be hydrolyzed before the cyanide ion is released.[76] The U.S. National Library of Medicine's Hazardous Substances Data Bank records no cases of amygdalin poisoning from consuming apple seeds.[77] +

+

Allergy

+

One form of apple allergy, often found in northern Europe, is called birch-apple syndrome and is found in people who are also allergic to birch pollen.[78] Allergic reactions are triggered by a protein in apples that is similar to birch pollen, and people affected by this protein can also develop allergies to other fruits, nuts, and vegetables. Reactions, which entail oral allergy syndrome (OAS), generally involve itching and inflammation of the mouth and throat,[78] but in rare cases can also include life-threatening anaphylaxis.[79] This reaction only occurs when raw fruit is consumed—the allergen is neutralized in the cooking process. The variety of apple, maturity and storage conditions can change the amount of allergen present in individual fruits. Long storage times can increase the amount of proteins that cause birch-apple syndrome.[78] +

In other areas, such as the Mediterranean, some individuals have adverse reactions to apples because of their similarity to peaches.[78] This form of apple allergy also includes OAS, but often has more severe symptoms, such as vomiting, abdominal pain and urticaria, and can be life-threatening. Individuals with this form of allergy can also develop reactions to other fruits and nuts. Cooking does not break down the protein causing this particular reaction, so affected individuals cannot eat raw or cooked apples. Freshly harvested, over-ripe fruits tend to have the highest levels of the protein that causes this reaction.[78] +

Breeding efforts have yet to produce a hypoallergenic fruit suitable for either of the two forms of apple allergy.[78] +

+

Uses

+ +

Nutrition

+
+ +
Apples, with skin (edible parts)
Nutritional value per 100 g (3.5 oz)
Energy218 kJ (52 kcal)
13.81 g
Sugars10.39
Dietary fiber2.4 g
+
0.17 g
+
0.26 g
+ + + + +
Vitamins and minerals
+
VitaminsQuantity
%DV
Vitamin A equiv.
0%
3 μg
0%
27 μg
29 μg
Thiamine (B1)
1%
0.017 mg
Riboflavin (B2)
2%
0.026 mg
Niacin (B3)
1%
0.091 mg
Pantothenic acid (B5)
1%
0.061 mg
Vitamin B6
2%
0.041 mg
Folate (B9)
1%
3 μg
Vitamin C
5%
4.6 mg
Vitamin E
1%
0.18 mg
Vitamin K
2%
2.2 μg
+
MineralsQuantity
%DV
Calcium
0%
6 mg
Iron
1%
0.12 mg
Magnesium
1%
5 mg
Manganese
2%
0.035 mg
Phosphorus
1%
11 mg
Potassium
4%
107 mg
Sodium
0%
1 mg
Zinc
0%
0.04 mg
+
Other constituentsQuantity
Water85.56 g
+

Percentages estimated using US recommendations for adults,[80] except for potassium, which is estimated based on expert recommendation from the National Academies.[81]
+
+

A raw apple is 86% water and 14% carbohydrates, with negligible content of fat and protein (table). A reference serving of a raw apple with skin weighing 100 g (3.5 oz) provides 52 calories and a moderate content of dietary fiber (table). Otherwise, there is low content of micronutrients, with the Daily Values of all falling below 10% (table). +

+

Culinary

+ +
Machine for paring, coring, and slicing apples, from Henry B. Scammell's 1897 handbook Cyclopedia of Valuable Receipts
+

Apple varieties can be grouped as cooking apples, eating apples, and cider apples, the last so astringent as to be "almost inedible".[82] Apples are consumed as juice, raw in salads, baked in pies, cooked into sauces and apple butter, or baked.[83] They are sometimes used as an ingredient in savory foods, such as sausage and stuffing.[84] +

Several techniques are used to preserve apples and apple products. Traditional methods include drying and making apple butter.[82] Juice and cider are produced commercially; cider is a significant industry in regions such as the West of England and Normandy.[82] +

A toffee apple (UK) or caramel apple (US) is a confection made by coating an apple in hot toffee or caramel candy respectively and allowing it to cool.[85][8] Apples and honey are a ritual food pairing eaten during the Jewish New Year of Rosh Hashanah.[86] +

Apples are an important ingredient in many desserts, such as pies, crumbles, and cakes. When cooked, some apple cultivars easily form a puree known as apple sauce, which can be cooked down to form a preserve, apple butter. They are often baked or stewed, and are cooked in some meat dishes.[82] +

+ +

Apples are milled or pressed to produce apple juice, which may be drunk unfiltered (called apple cider in North America), or filtered. Filtered juice is often concentrated and frozen, then reconstituted later and consumed. Apple juice can be fermented to make cider (called hard cider in North America), ciderkin, and vinegar.[8] Through distillation, various alcoholic beverages can be produced, such as applejack, Calvados, and apple brandy.[8][87] +

+

Organic production

+

Organic apples are commonly produced in the United States.[88] Due to infestations by key insects and diseases, organic production is difficult in Europe.[89] The use of pesticides containing chemicals, such as sulfur, copper, microorganisms, viruses, clay powders, or plant extracts (pyrethrum, neem) has been approved by the EU Organic Standing Committee to improve organic yield and quality.[89] A light coating of kaolin, which forms a physical barrier to some pests, also may help prevent apple sun scalding.[48] +

+

Non-browning apples

+

Apple skins and seeds contain polyphenols.[90] These are oxidised by the enzyme polyphenol oxidase, which causes browning in sliced or bruised apples, by catalyzing the oxidation of phenolic compounds to o-quinones, a browning factor.[91] Browning reduces apple taste, color, and food value. Arctic apples, a non-browning group of apples introduced to the United States market in 2019, have been genetically modified to silence the expression of polyphenol oxidase, thereby delaying a browning effect and improving apple eating quality.[92][93] The US Food and Drug Administration in 2015, and Canadian Food Inspection Agency in 2017, determined that Arctic apples are as safe and nutritious as conventional apples.[94][95] +

+

Other products

+

Apple seed oil is obtained by pressing apple seeds for manufacturing cosmetics.[96] +

+

In culture

+ +

Germanic paganism

+
Illustration of girl in a red dress, holding 3 candles in one hand and a basket of apples in the other
"Brita as Iduna" (1901) by Carl Larsson
+

In Norse mythology, the goddess Iðunn is portrayed in the Prose Edda (written in the 13th century by Snorri Sturluson) as providing apples to the gods that give them eternal youthfulness. The English scholar H. R. Ellis Davidson links apples to religious practices in Germanic paganism, from which Norse paganism developed. She points out that buckets of apples were found in the Oseberg ship burial site in Norway, that fruit and nuts (Iðunn having been described as being transformed into a nut in Skáldskaparmál) have been found in the early graves of the Germanic peoples in England and elsewhere on the continent of Europe, which may have had a symbolic meaning, and that nuts are still a recognized symbol of fertility in southwest England.[97] +

Davidson notes a connection between apples and the Vanir, a tribe of gods associated with fertility in Norse mythology, citing an instance of eleven "golden apples" being given to woo the beautiful Gerðr by Skírnir, who was acting as messenger for the major Vanir god Freyr in stanzas 19 and 20 of Skírnismál. Davidson also notes a further connection between fertility and apples in Norse mythology in chapter 2 of the Völsunga saga: when the major goddess Frigg sends King Rerir an apple after he prays to Odin for a child, Frigg's messenger (in the guise of a crow) drops the apple in his lap as he sits atop a mound.[97] Rerir's wife's consumption of the apple results in a six-year pregnancy and the birth (by Caesarean section) of their son—the hero Völsung.[98] +

Further, Davidson points out the "strange" phrase "Apples of Hel" used in an 11th-century poem by the skald Thorbiorn Brúnarson. She states this may imply that the apple was thought of by Brúnarson as the food of the dead. Further, Davidson notes that the potentially Germanic goddess Nehalennia is sometimes depicted with apples and that parallels exist in early Irish stories. Davidson asserts that while cultivation of the apple in Northern Europe extends back to at least the time of the Roman Empire and came to Europe from the Near East, the native varieties of apple trees growing in Northern Europe are small and bitter. Davidson concludes that in the figure of Iðunn "we must have a dim reflection of an old symbol: that of the guardian goddess of the life-giving fruit of the other world."[97] +

+

Greek mythology

+
Heracles with the apple of Hesperides
+

Apples appear in many religious traditions, including Greek and Roman mythology where they have an ambiguous symbolism of discord, fertility, or courtship.[99] In Greek mythology, the Greek hero Heracles, as a part of his Twelve Labours, was required to travel to the Garden of the Hesperides and pick the golden apples off the Tree of Life growing at its center.[100] +

The Greek goddess of discord, Eris, became disgruntled after she was excluded from the wedding of Peleus and Thetis.[101] In retaliation, she tossed a golden apple inscribed Καλλίστη (Kallistē, "For the most beautiful one"), into the wedding party. Three goddesses claimed the apple: Hera, Athena, and Aphrodite. Paris of Troy was appointed to select the recipient. After Hera and Athena had each offered him a bribe, Aphrodite tempted him with the most beautiful woman in the world, Helen of Sparta. He awarded the apple to Aphrodite, thus indirectly causing the Trojan War.[102][103] +

The apple was thus considered, in ancient Greece, sacred to Aphrodite. To throw an apple at someone was to symbolically declare one's love; and similarly, to catch it was to symbolically show one's acceptance of that love. An epigram claiming authorship by Plato states:[104] +

+

I throw the apple at you, and if you are willing to love me, take it and share your girlhood with me; but if your thoughts are what I pray they are not, even then take it, and consider how short-lived is beauty.

— Plato, Epigram VII
+

Atalanta, also of Greek mythology, raced all her suitors in an attempt to avoid marriage. She outran all but Hippomenes (also known as Melanion, a name possibly derived from melon, the Greek word for both "apple" and fruit in general),[100] who defeated her by cunning, not speed. Hippomenes knew that he could not win in a fair race, so he used three golden apples (gifts of Aphrodite, the goddess of love) to distract Atalanta. It took all three apples and all of his speed, but Hippomenes was finally successful, winning the race and Atalanta's hand.[105][106] +

+

Celtic mythology

+

In Celtic mythology, the otherworld has many names, including Emain Ablach, "Emain of the Apple-trees". A version of this is Avalon in Arthurian legend, or in Welsh Ynys Afallon, "Island of Apples".[107] +

+

China

+
Píngānguǒ ("Peace apples") on sale in Beijing for Christmas Eve (2017)
+

In China, apples symbolise peace, since the sounds of the first element ("píng") in the words "apple" (苹果, Píngguǒ) and "peace" (平安, Píng'ān) are homophonous in Mandarin and Cantonese.[3][108] When these two words are combined, the word Píngānguǒ (平安果, "Peace apples") is formed. This association developed further as the name for Christmas Eve in Mandarin is Píngānyè (平安夜, "Peaceful/Quiet Evening"), which made the gifting of apples at this season to friends and associates popular, as a way to wish them peace and safety.[108] +

+

Christian art

+
Adam and Eve by Albrecht Dürer (1507), showcasing the apple as a symbol of sin
+

Though the forbidden fruit of Eden in the Book of Genesis is not identified, popular Christian tradition has held that it was an apple that Eve coaxed Adam to share with her.[109] The origin of the popular identification with a fruit unknown in the Middle East in biblical times is found in wordplay with the Latin words mālum (an apple) and mălum (an evil), each of which is normally written malum.[110] The tree of the forbidden fruit is called "the tree of the knowledge of good and evil" in Genesis 2:17,[111] and the Latin for "good and evil" is bonum et malum.[112] +

Renaissance painters may also have been influenced by the story of the golden apples in the Garden of Hesperides. As a result, in the story of Adam and Eve, the apple became a symbol for knowledge, immortality, temptation, the fall of man into sin, and sin itself. The larynx in the human throat has been called the "Adam's apple" because of a notion that it was caused by the forbidden fruit remaining in the throat of Adam. The apple as symbol of sexual seduction has been used to imply human sexuality, possibly in an ironic vein.[109] +

+

Proverb

+

The proverb, "An apple a day keeps the doctor away", addressing the supposed health benefits of the fruit, has been traced to 19th-century Wales, where the original phrase was "Eat an apple on going to bed, and you'll keep the doctor from earning his bread".[113] In the 19th century and early 20th, the phrase evolved to "an apple a day, no doctor to pay" and "an apple a day sends the doctor away"; the phrasing now commonly used was first recorded in 1922.[114] +

+

See also

+ +

References

+
+
    +
  1. ^ Jump up to: a b c d e f g h i j k Dickson, Elizabeth E. (28 May 2021). "Malus domestica". Flora of North America. Archived from the original on 28 July 2024. Retrieved 27 July 2024. +
  2. +
  3. ^ Jump up to: a b c "Malus domestica (Suckow) Borkh". Plants of the World Online. Royal Botanic Gardens, Kew. Retrieved 31 July 2024. +
  4. +
  5. ^ Jump up to: a b Lim, Lisa (6 July 2021). "Where the word 'apple' came from and why the forbidden fruit was unlucky to be linked with the fall of man". Language Matters. South China Morning Post. Hong Kong, China: Alibaba Group. Archived from the original on 28 June 2023. Retrieved 28 June 2023. +
  6. +
  7. ^ "Origin and meaning of "apple" by Online Etymology Dictionary". Online Etymology Dictionary. Archived from the original on 21 December 2019. Retrieved 22 November 2019. +
  8. +
  9. ^ Jump up to: a b c d e f g Rieger, Mark. "Apple - Malus domestica". HORT 3020: Intro Fruit Crops. University of Georgia. Archived from the original on 21 January 2008. Retrieved 22 January 2008. +
  10. +
  11. ^ Jump up to: a b c "Apples - Malus domestica". North Carolina Extension Gardener Plant Toolbox. North Carolina State University. Archived from the original on 31 May 2024. Retrieved 31 July 2024. +
  12. +
  13. ^ Jump up to: a b c d Heil, Kenneth D.; O'Kane, Jr., Steve L.; Reeves, Linda Mary; Clifford, Arnold (2013). Flora of the Four Corners Region: Vascular Plants of the San Juan River Drainage, Arizona, Colorado, New Mexico, and Utah (First ed.). St. Louis, Missouri: Missouri Botanical Garden. p. 909. ISBN 978-1-930723-84-9. ISSN 0161-1542. LCCN 2012949654. OCLC 859541992. Retrieved 27 July 2024. +
  14. +
  15. ^ Jump up to: a b c d e Lim, Tong Kwee (2012). "Malus x domestica". Edible Medicinal and Non-Medicinal Plants. Vol. 4, Fruit (First ed.). Dordrecht, the Netherlands: Springer. pp. 414–415. doi:10.1007/978-94-007-4053-2_49. ISBN 978-94-007-4053-2. OCLC 795503871. +
  16. +
  17. ^ Juniper, Barrie E.; Mabberley, David J. (2006). The Story of the Apple (First ed.). Portland, Oregon: Timber Press. p. 27. ISBN 978-0-88192-784-9. LCCN 2006011869. OCLC 67383484. Retrieved 1 August 2024. +
  18. +
  19. ^ "Fruit glossary". Royal Horticultural Society. Archived from the original on 7 August 2024. Retrieved 7 August 2024. +
  20. +
  21. ^ Burford, Tom (2013). Apples of North America : 192 Exceptional Varieties for Gardeners, Growers and Cooks (First ed.). Portland, Oregon: Timber Press. pp. 22, 50, 55, 122, 123, 137, 141, 147, 159, 245, 246. ISBN 978-1-60469-249-5. LCCN 2012045130. OCLC 819860825. +
  22. +
  23. ^ "Shape". Western Agricultural Research Center. Montana State University. Archived from the original on 23 April 2024. Retrieved 30 July 2024. +
  24. +
  25. ^ Jump up to: a b Janick, Jules; Cummins, James N.; Brown, Susan K.; Hemmat, Minou (1996). "Chapter 1: Apples" (PDF). Fruit Breeding. Vol. I: Tree and Tropical Fruits. New York: John Wiley & Sons. pp. 9, 48. ISBN 978-0-471-31014-3. LCCN 95016407. OCLC 1302621533. Archived (PDF) from the original on 19 July 2013. Retrieved 30 August 2024. +
  26. +
  27. ^ "Natural Waxes on Fruits". Postharvest.tfrec.wsu.edu. 29 October 2010. Archived from the original on 24 May 2013. Retrieved 14 June 2013. +
  28. +
  29. ^ Flath, R. A.; Black, D. R.; Forrey, R. R.; McDonald, G. M.; Mon, T. R.; Teranishi, R. (1 August 1969). "Volatiles in Gravenstein Apple Essence Identified by GC-Mass Spectrometry". Journal of Chromatographic Science. 7 (8): 508. doi:10.1093/CHROMSCI/7.8.508. +
  30. +
  31. ^ Flath, Robert A.; Black, Dale Robert.; Guadagni, Dante G.; McFadden, William H.; Schultz, Thomas H. (January 1967). "Identification and organoleptic evaluation of compounds in Delicious apple essence". Journal of Agricultural and Food Chemistry. 15 (1): 29. doi:10.1021/jf60149a032. +
  32. +
  33. ^ Jump up to: a b Qian, Guan-Ze; Liu, Lian-Fen; Tang, Geng-Guo (April 2010). "(1933) Proposal to conserve the name Malus domestica against M. pumila, M. communis, M. frutescens, and Pyrus dioica ( Rosaceae )". Taxon. 59 (2): 650–652. doi:10.1002/tax.592038. +
  34. +
  35. ^ Applequist, Wendy L. (2017). "Report of the Nomenclature Committee for Vascular Plants: 69" (PDF). Taxon. 66 (2): 500–513. doi:10.12705/662.17. Archived (PDF) from the original on 7 May 2024. +
  36. +
  37. ^ Wilson, Karen L. (June 2017). "Report of the General Committee: 18". Taxon. 66 (3): 742. doi:10.12705/663.15. +
  38. +
  39. ^ Jump up to: a b Velasco, Riccardo; Zharkikh, Andrey; Affourtit, Jason; Dhingra, Amit; Cestaro, Alessandro; et al. (2010). "The genome of the domesticated apple (Malus × domestica Borkh.)". Nature Genetics. 42 (10): 833–839. doi:10.1038/ng.654. PMID 20802477. S2CID 14854514. +
  40. +
  41. ^ Di Pierro, Erica A.; Gianfranceschi, Luca; Di Guardo, Mario; Koehorst-Van Putten, Herma J.J.; Kruisselbrink, Johannes W.; et al. (2016). "A high-density, multi-parental SNP genetic map on apple validates a new mapping approach for outcrossing species". Horticulture Research. 3 (1): 16057. Bibcode:2016HorR....316057D. doi:10.1038/hortres.2016.57. PMC 5120355. PMID 27917289. +
  42. +
  43. ^ Jump up to: a b Daccord, Nicolas; Celton, Jean-Marc; Linsmith, Gareth; et al. (2017). "High-quality de novo assembly of the apple genome and methylome dynamics of early fruit development". Nature Genetics. 49 (7). Nature Communications: 1099–1106. doi:10.1038/ng.3886. hdl:10449/42064. PMID 28581499. S2CID 24690391. +
  44. +
  45. ^ Jump up to: a b Zhang, Liyi; Hu, Jiang; Han, Xiaolei; Li, Jingjing; Gao, Yuan; et al. (2019). "A high-quality apple genome assembly reveals the association of a retrotransposon and red fruit colour". Nature Communications. 10 (1). Nature Genetics: 1494. Bibcode:2019NatCo..10.1494Z. doi:10.1038/s41467-019-09518-x. PMC 6445120. PMID 30940818. +
  46. +
  47. ^ Jump up to: a b c d e Duan, Naibin; Bai, Yang; Sun, Honghe; Wang, Nan; Ma, Yumin; et al. (2017). "Genome re-sequencing reveals the history of apple and supports a two-stage model for fruit enlargement". Nature Communications. 8 (1): 249. Bibcode:2017NatCo...8..249D. doi:10.1038/s41467-017-00336-7. PMC 5557836. PMID 28811498. +
  48. +
  49. ^ Richards, Christopher M.; Volk, Gayle M.; Reilley, Ann A.; Henk, Adam D.; Lockwood, Dale R.; et al. (2009). "Genetic diversity and population structure in Malus sieversii, a wild progenitor species of domesticated apple". Tree Genetics & Genomes. 5 (2): 339–347. doi:10.1007/s11295-008-0190-9. S2CID 19847067. +
  50. +
  51. ^ Lauri, Pierre-éric; Maguylo, Karen; Trottier, Catherine (March 2006). "Architecture and size relations: an essay on the apple (Malus × domestica, Rosaceae) tree". American Journal of Botany. 93 (3): 357–368. doi:10.3732/ajb.93.3.357. PMID 21646196. Archived from the original on 20 April 2019. Retrieved 27 July 2024. +
  52. +
  53. ^ Cornille, Amandine; Gladieux, Pierre; Smulders, Marinus J. M.; Roldán-Ruiz, Isabel; Laurens, François; et al. (2012). Mauricio, Rodney (ed.). "New Insight into the History of Domesticated Apple: Secondary Contribution of the European Wild Apple to the Genome of Cultivated Varieties". PLOS Genetics. 8 (5): e1002703. doi:10.1371/journal.pgen.1002703. PMC 3349737. PMID 22589740. +
  54. +
  55. ^ Kean, Sam (17 May 2012). "ScienceShot: The Secret History of the Domesticated Apple". Archived from the original on 11 June 2016. +
  56. +
  57. ^ Coart, E.; Van Glabeke, S.; De Loose, M.; Larsen, A.S.; Roldán-Ruiz, I. (2006). "Chloroplast diversity in the genus Malus: new insights into the relationship between the European wild apple (Malus sylvestris (L.) Mill.) and the domesticated apple (Malus domestica Borkh.)". Mol. Ecol. 15 (8): 2171–2182. Bibcode:2006MolEc..15.2171C. doi:10.1111/j.1365-294x.2006.02924.x. PMID 16780433. S2CID 31481730. +
  58. +
  59. ^ Rottoli, Mauro; Pessina, Andrea (2007). "Chapter 9: Neolithic agriculture in Italy: an update of archaeobotanical data with particular emphasis on northern settlements". In Colledge, Sue; Conolly, James (eds.). The Origins and Spread of Domestic Plants in Southwest Asia and Europe (First ed.). Walnut Creek, California: Left Coast Press; University College London Institute of Archaeology Publications. pp. 142–143. ISBN 978-1-59874-988-5. OCLC 84838157. +
  60. +
  61. ^ Jump up to: a b c d Schlumbaum, Angela; van Glabeke, Sabine; Roldan-Ruiz, Isabel (January 2012). "Towards the onset of fruit tree growing north of the Alps: Ancient DNA from waterlogged apple (Malus sp.) seed fragments". Annals of Anatomy - Anatomischer Anzeiger. 194 (1): 157–162. doi:10.1016/j.aanat.2011.03.004. PMID 21501956. +
  62. +
  63. ^ Sauer, Jonathan D. (1993). Historical Geography of Crop Plants: A Select Roster (First ed.). Boca Raton, Florida: CRC Press. pp. 109–113. ISBN 978-0-8493-8901-6. LCCN 92045590. OCLC 27224696. +
  64. +
  65. ^ Plinius, Gaius Secundus (1855). The Natural History of Pliny. Vol. III. Translated by Bostock, John; Riley, Henry T. London: Henry G. Bohn. p. 303. Retrieved 3 August 2024. +
  66. +
  67. ^ Martin, Alice A. (1976). All About Apples (First ed.). Boston, Massachusetts: Houghton Mifflin Company. pp. 64–65. ISBN 978-0-395-20724-6. OCLC 1733691. Retrieved 3 August 2024. +
  68. +
  69. ^ Adamson, Melitta Weiss (2004). Food in Medieval Times (First ed.). Westport, Connecticut: Greenwood Press. pp. 19–20. ISBN 978-0-313-32147-4. LCCN 2004014054. OCLC 55738647. +
  70. +
  71. ^ Torrejón, Fernando; Cisternas, Marco; Araneda, Alberto (2004). "Efectos ambientales de la colonización española desde el río Maullín al archipiélago de Chiloé, sur de Chile" [Environmental effects of the spanish colonization from de Maullín river to the Chiloé archipelago, southern Chile]. Revista Chilena de Historia Natural (in Spanish). 77 (4): 661–677. doi:10.4067/s0716-078x2004000400009. +
  72. +
  73. ^ Smith, Archibald William (1963). A Gardener's Book of Plant Names : A Handbook of the Meaning and Origins of Plant Names (First ed.). New York: Harper & Row. p. 40. LCCN 62009906. OCLC 710612. Retrieved 10 August 2024. +
  74. +
  75. ^ Jump up to: a b c Poole, Mike (1980). "Heirloom Apples". In Lawrence, James (ed.). The Harrowsmith Reader Volume II. Camden East, Ontario: Camden House Publishing. p. 122. ISBN 978-0-920656-11-2. OCLC 1336124440. Retrieved 10 August 2024. +
  76. +
  77. ^ Van Valen, James M. (1900). History of Bergen County, New Jersey. New York: New Jersey Publishing and Engraving Company. pp. 33–34. OCLC 25697876. Retrieved 9 August 2024. +
  78. +
  79. ^ Brox, Jane (1999). Five Thousand Days Like This One (First ed.). Boston, Massachusetts: Beacon Press. pp. 150–151. ISBN 978-0-8070-2106-4. LCCN 98035051. OCLC 39605684. Retrieved 9 August 2024. +
  80. +
  81. ^ Cohen, Rachel D. (26 November 2018). "Thanks To Science, You Can Eat An Apple Every Day". The Salt. NPR. Archived from the original on 18 June 2024. Retrieved 1 August 2024. +
  82. +
  83. ^ "The Heirloom Apple Orchard". The Jentsch Lab. Cornell University. Archived from the original on 30 July 2024. Retrieved 9 August 2024. +
  84. +
  85. ^ Ranney, Thomas G. "Polyploidy: From Evolution to Landscape Plant Improvement". Proceedings of the 11th Metropolitan Tree Improvement Alliance (METRIA) Conference. 11th Metropolitan Tree Improvement Alliance Conference held in Gresham, Oregon, August 23–24, 2000. METRIA (NCSU.edu). METRIA. Archived from the original on 23 July 2010. Retrieved 7 November 2010. +
  86. +
  87. ^ Lord, William G.; Ouellette, Amy (February 2010). "Dwarf Rootstocks for Apple Trees in the Home Garden" (PDF). University of New Hampshire. Archived from the original (PDF) on 30 September 2013. Retrieved 1 September 2013. +
  88. +
  89. ^ Fallahi, Esmaeil; Colt, W. Michael; Fallahi, Bahar; Chun, Ik-Jo (January 2002). "The Importance of Apple Rootstocks on Tree Growth, Yield, Fruit Quality, Leaf Nutrition, and Photosynthesis with an Emphasis on 'Fuji'". HortTechnology. 12 (1): 38–44. doi:10.21273/HORTTECH.12.1.38. Archived (PDF) from the original on 11 February 2014. Retrieved 9 August 2024. +
  90. +
  91. ^ Parker, M.L. (September 1993). "Apple Rootstocks and Tree Spacing". North Carolina Cooperative Extension Service. Archived from the original on 11 September 2013. Retrieved 1 September 2013. +
  92. +
  93. ^ Ferree, David Curtis; Warrington, Ian J. (2003). Apples: Botany, Production, and Uses. New York: Centre for Agriculture and Bioscience International. pp. 33–35. ISBN 978-0851995922. OCLC 133167834. +
  94. +
  95. ^ Jump up to: a b c d Polomski, Bob; Reighard, Greg. "Apple HGIC 1350". Home & Garden Information Center. Clemson University. Archived from the original on 28 February 2008. Retrieved 22 January 2008. +
  96. +
  97. ^ Barahona, M. (1992). "Adaptation of Apple Varieties in Ecuador". Acta Horticulturae (310): 135–142. doi:10.17660/ActaHortic.1992.310.17. +
  98. +
  99. ^ Adamson, Nancy Lee (2011). An Assessment of Non-Apis Bees as Fruit and Vegetable Crop Pollinators in Southwest Virginia (PDF) (Doctor of Philosophy in Entomology thesis). Virginia Polytechnic Institute and State University. Archived (PDF) from the original on 20 November 2015. Retrieved 15 October 2015. +
  100. +
  101. ^ Powell, L.E. (1986). "The Chilling Requirement in Apple and Its Role in Regulating Time of Flowering in Spring in Cold-Winter Climate". Acta Horticulturae (179). Wageningen, Netherlands: International Society for Horticultural Science: 129–140. doi:10.17660/ActaHortic.1986.179.10. ISBN 978-90-6605-182-9. +
  102. +
  103. ^ Romano, Andrea (10 September 2023). "20 Best Places to Go Apple Picking in the United States". Travel + Leisure. Archived from the original on 21 April 2024. Retrieved 2 August 2024. +
  104. +
  105. ^ Graziano, Jack; Farcuh, Macarena (10 September 2021). "Controlled Atmosphere Storage of Apples". University of Maryland Extension. Archived from the original on 24 March 2023. Retrieved 2 August 2024. +
  106. +
  107. ^ "FoodKeeper App". FoodSafety.gov. United States Department of Health and Human Services. 26 April 2019. Retrieved 17 September 2024. +
  108. +
  109. ^ "4 Steps to Food Safety". FoodSafety.gov. United States Department of Health and Human Services. 12 April 2019. Retrieved 17 September 2024. +
  110. +
  111. ^ "Refrigerated storage of perishable foods". CSIRO. 26 February 2015. Archived from the original on 15 March 2015. Retrieved 25 May 2007. +
  112. +
  113. ^ Karp, David (25 October 2006). "Puff the Magic Preservative: Lasting Crunch, but Less Scent". The New York Times. Archived from the original on 3 August 2011. Retrieved 26 July 2017. +
  114. +
  115. ^ Jackson, H.S. (1914). "Powdery Mildew". In Lowther, Granville; Worthington, William (eds.). The Encyclopedia of Practical Horticulture: A Reference System of Commercial Horticulture, Covering the Practical and Scientific Phases of Horticulture, with Special Reference to Fruits and Vegetables. Vol. I. North Yakima, Washington: The Encyclopedia of Horticulture Corporation. pp. 475–476. Retrieved 1 August 2024. +
  116. +
  117. ^ Lowther, Granville; Worthington, William, eds. (1914). The Encyclopedia of Practical Horticulture: A Reference System of Commercial Horticulture, Covering the Practical and Scientific Phases of Horticulture, with Special Reference to Fruits and Vegetables. Vol. I. North Yakima, Washington: The Encyclopedia of Horticulture Corporation. pp. 45–51. Retrieved 1 August 2024. +
  118. +
  119. ^ Coli, William M.; Los, Lorraine M., eds. (2003). "Insect Pests". 2003-2004 New England Apple Pest Management Guide. University of Massachusetts Amherst. pp. 28–29. Archived from the original on 12 February 2008. Retrieved 3 March 2008. +
  120. +
  121. ^ Jump up to: a b Atthowe, Helen; Gilkeson, Linda A.; Kite, L. Patricia; Michalak, Patricia S.; Pleasant, Barbara; Reich, Lee; Scheider, Alfred F. (2009). Bradley, Fern Marshall; Ellis, Barbara W.; Martin, Deborah L. (eds.). The Organic Gardener's Handbook of Natural Pest and Disease Control. New York: Rodale, Inc. pp. 32–34. ISBN 978-1-60529-677-7. LCCN 2009039996. OCLC 419860680. +
  122. +
  123. ^ Coli, William M.; Berkett, Lorraine P.; Spitko, Robin, eds. (2003). "Other Apple Diseases". 2003-2004 New England Apple Pest Management Guide. University of Massachusetts Amherst. pp. 19–27. Archived from the original on 12 February 2008. Retrieved 3 March 2008. +
  124. +
  125. ^ Martin, Phillip L.; Krawczyk, Teresa; Khodadadi, Fatemeh; Aćimović, Srđan G.; Peter, Kari A. (2021). "Bitter Rot of Apple in the Mid-Atlantic United States: Causal Species and Evaluation of the Impacts of Regional Weather Patterns and Cultivar Susceptibility". Phytopathology. 111 (6): 966–981. doi:10.1094/PHYTO-09-20-0432-R. ISSN 0031-949X. PMID 33487025. S2CID 231701083. +
  126. +
  127. ^ Erler, Fedai (1 January 2010). "Efficacy of tree trunk coating materials in the control of the apple clearwing, Synanthedon myopaeformis". Journal of Insect Science. 10 (1): 63. doi:10.1673/031.010.6301. PMC 3014806. PMID 20672979. +
  128. +
  129. ^ Elzebroek, A. T. G.; Wind, Koop (2008). Guide to Cultivated Plants. Wallingford, United Kingdom: CABI. p. 27. ISBN 978-1-84593-356-2. LCCN 2007028459. OCLC 156975183. Archived from the original on 20 October 2020. Retrieved 6 October 2020. +
  130. +
  131. ^ Jump up to: a b "Apple – Malus domestica". Natural England. Archived from the original on 12 May 2008. Retrieved 22 January 2008. +
  132. +
  133. ^ "Home". National Fruit Collection. Archived from the original on 15 June 2012. Retrieved 2 December 2012. +
  134. +
  135. ^ "ECPGR Malus/Pyrus Working Group Members". Ecpgr.cgiar.org. 22 July 2002. Archived from the original on 26 August 2014. Retrieved 25 August 2014. +
  136. +
  137. ^ Jump up to: a b Tarjan, Sue (Fall 2006). "Autumn Apple Musings" (PDF). News & Notes of the UCSC Farm & Garden, Center for Agroecology & Sustainable Food Systems. pp. 1–2. Archived from the original (PDF) on 11 August 2007. Retrieved 24 January 2008. +
  138. +
  139. ^ Beck, Kellen (17 October 2020). "How breeders bring out the best in new apples". Mashable. Archived from the original on 31 July 2024. Retrieved 31 July 2024. +
  140. +
  141. ^ Migicovsky, Zoë (22 August 2021). "How a few good apples spawned today's top varieties — and why breeders must branch out". The Conversation. Archived from the original on 31 July 2024. Retrieved 31 July 2024. +
  142. +
  143. ^ Peil, A.; Dunemann, F.; Richter, K.; Hoefer, M.; Király, I.; Flachowsky, H.; Hanke, M.-V. (2008). "Resistance Breeding in Apple at Dresden-Pillnitz". Ecofruit - 13th International Conference on Cultivation Technique and Phytopathological Problems in Organic Fruit-Growing: Proceedings to the Conference from 18th February to 20th February 2008 at Weinsberg/Germany (in German): 220–225. Archived from the original on 28 January 2021. Retrieved 31 July 2024. +
  144. +
  145. ^ Jump up to: a b "World apple situation". Archived from the original on 11 February 2008. Retrieved 24 January 2008. +
  146. +
  147. ^ Weaver, Sue (June–July 2003). "Crops & Gardening – Apples of Antiquity". Hobby Farms Magazine. Archived from the original on 19 February 2017. +
  148. +
  149. ^ Jump up to: a b c "Apple production in 2022; from pick lists: Crops/World Regions/Production Quantity". FAOSTAT, UN Food and Agriculture Organization, Statistics Division. 2024. Archived from the original on 12 November 2016. Retrieved 18 June 2024. +
  150. +
  151. ^ Nelson, Lewis S.; Shih, Richard D.; Balick, Michael J. (2007). Handbook of Poisonous and Injurious Plants (Second ed.). New York: New York Botanical Garden : Springer. pp. 27, 211–212. ISBN 978-0387-31268-2. LCCN 2005938815. OCLC 77537459. Retrieved 11 September 2024. +
  152. +
  153. ^ "Amygdalin". Toxnet, US Library of Medicine. Archived from the original on 21 April 2017. Retrieved 20 April 2017. +
  154. +
  155. ^ Jump up to: a b c d e f "General Information – Apple". Informall. Archived from the original on 23 July 2012. Retrieved 17 October 2011. +
  156. +
  157. ^ Landau, Elizabeth (8 April 2009). "Oral allergy syndrome may explain mysterious reactions". CNN Health. Retrieved 17 October 2011. +
  158. +
  159. ^ United States Food and Drug Administration (2024). "Daily Value on the Nutrition and Supplement Facts Labels". FDA. Archived from the original on 27 March 2024. Retrieved 28 March 2024. +
  160. +
  161. ^ National Academies of Sciences, Engineering, and Medicine; Health and Medicine Division; Food and Nutrition Board; Committee to Review the Dietary Reference Intakes for Sodium and Potassium (2019). Oria, Maria; Harrison, Meghan; Stallings, Virginia A. (eds.). Dietary Reference Intakes for Sodium and Potassium. The National Academies Collection: Reports funded by National Institutes of Health. Washington, DC: National Academies Press (US). ISBN 978-0-309-48834-1. PMID 30844154. Archived from the original on 9 May 2024. Retrieved 21 June 2024. +
  162. +
  163. ^ Jump up to: a b c d Davidson, Alan (2014). "Apple". In Jaine, Tom (ed.). The Oxford Companion to Food. Illustrated by Soun Vannithone (Third ed.). Oxford: Oxford University Press. pp. 27–31. ISBN 978-0-19-967733-7. LCCN 2013957569. OCLC 890807357. OL 27172691M. Retrieved 18 September 2024. +
  164. +
  165. ^ Traverso, Amy (2011). The Apple Lover's Cookbook. Photographs by Squire Fox (First ed.). New York: W.W. Norton & Company. pp. 16, 32, 35, 45, 92, 137, 262–263, 275. ISBN 978-0-393-06599-2. LCCN 2011016560. OCLC 711051767. OL 16450839W. +
  166. +
  167. ^ Kellogg, Kristi (15 January 2015). "81 Best Apple Recipes: Dinners, Desserts, Salads, and More". Epicurious. Archived from the original on 18 October 2020. Retrieved 17 October 2020. +
  168. +
  169. ^ Davidson, Alan (2014). "Toffee Apple". In Jaine, Tom (ed.). The Oxford Companion to Food. Illustrated by Soun Vannithone (Third ed.). Oxford: Oxford University Press. p. 824. ISBN 978-0-19-967733-7. LCCN 2013957569. OCLC 890807357. OL 27172691M. Retrieved 18 September 2024. +
  170. +
  171. ^ Shurpin, Yehuda. "Why All the Symbolic Rosh Hashanah Foods? "בולבול"". Chabad.org. Archived from the original on 21 March 2023. Retrieved 21 March 2023. +
  172. +
  173. ^ Yepsen, Roger B. (2017) [1994]. Apples (Revised and Updated ed.). New York: W.W. Norton & Company. p. 52. ISBN 978-1-68268-019-3. LCCN 2017010136. OCLC 973918728. +
  174. +
  175. ^ "Organic apples". USDA Agricultural Marketing Service. February 2016. Archived from the original on 24 February 2017. Retrieved 23 February 2017. +
  176. +
  177. ^ Jump up to: a b "European Organic Apple Production Demonstrates the Value of Pesticides" (PDF). CropLife Foundation, Washington, DC. December 2011. Archived (PDF) from the original on 24 February 2017. Retrieved 23 February 2017. +
  178. +
  179. ^ Ribeiro, Flávia A.P.; Gomes de Moura, Carolina F.; Aguiar, Odair; de Oliveira, Flavia; Spadari, Regina C.; Oliveira, Nara R.C.; Oshima, Celina T.F.; Ribeiro, Daniel A. (September 2014). "The chemopreventive activity of apple against carcinogenesis: antioxidant activity and cell cycle control". European Journal of Cancer Prevention (Review). 23 (5): 477–480. doi:10.1097/CEJ.0000000000000005. PMID 24366437. S2CID 23026644. +
  180. +
  181. ^ Nicolas, J. J.; Richard-Forget, F. C.; Goupy, P. M.; Amiot, M. J.; Aubert, S. Y. (1 January 1994). "Enzymatic browning reactions in apple and apple products". Critical Reviews in Food Science and Nutrition. 34 (2): 109–157. doi:10.1080/10408399409527653. PMID 8011143. +
  182. +
  183. ^ "PPO silencing". Okanagan Specialty Fruits. 2019. Archived from the original on 27 April 2021. Retrieved 14 November 2019. +
  184. +
  185. ^ "United States: GM non-browning Arctic apple expands into foodservice". Fresh Fruit Portal. 13 August 2019. Archived from the original on 27 June 2021. Retrieved 14 November 2019. +
  186. +
  187. ^ "Okanagan Specialty Fruits: Biotechnology Consultation Agency Response Letter BNF 000132". U.S. Food and Drug Administration. 20 March 2015. Archived from the original on 31 October 2017. Retrieved 14 November 2019. +
  188. +
  189. ^ "Questions and answers: Arctic Apple". Canadian Food Inspection Agency, Government of Canada. 8 September 2017. Archived from the original on 19 September 2018. Retrieved 14 November 2019. +
  190. +
  191. ^ Yu, Xiuzhu; Van De Voort, Frederick R.; Li, Zhixi; Yue, Tianli (2007). "Proximate Composition of the Apple Seed and Characterization of Its Oil". International Journal of Food Engineering. 3 (5). doi:10.2202/1556-3758.1283. S2CID 98590230. +
  192. +
  193. ^ Jump up to: a b c Davidson, Hilda Roderick Ellis (1990) [1st pub. 1964]. Gods and Myths of Northern Europe. London: Penguin Books. pp. 165–166. ISBN 0-14-013627-4. OCLC 29336401. +
  194. +
  195. ^ Davidson, Hilda Ellis (1998). Roles of the Northern Goddess. London; New York: Routledge. pp. 146–147. doi:10.4324/9780203025550. ISBN 0-415-13610-5. LCCN 97018309. OCLC 48138055. +
  196. +
  197. ^ Biedermann, Hans (1992). Dictionary of Symbolism. Translated by Hulbert, James. New York: Facts on File. pp. 16–17. ISBN 978-0-8160-2593-0. LCCN 91044933. OCLC 25092926. Retrieved 3 October 2024. +
  198. +
  199. ^ Jump up to: a b Ruck, Carl A. P.; Staples, Blaise D.; Heinrich, Clark (2001). The apples of Apollo : pagan and Christian mysteries of the Eucharist. Durham, North Carolina: Carolina Academic Press. pp. 64–70. ISBN 978-0-89089-924-3. LCCN 00040351. OCLC 46337324. +
  200. +
  201. ^ "Eris - Greek Goddess of Strife & Discord (Roman Discordia)". Theoi Project. Aaron J. Atsma. Archived from the original on 25 September 2024. Retrieved 26 September 2024. +
  202. +
  203. ^ Lucian (1905). The Works of Lucian of Samosata. Vol. I. Translated by Fowler, H.W.; Fowler, F.G. (First ed.). Oxford: Clarendon Press. pp. 78–85. LCCN 06001045. OCLC 506365. Retrieved 26 September 2024. +
  204. +
  205. ^ "Judgement of Paris - Greek Mythology". Theoi Project. Aaron J. Atsma. Archived from the original on 24 August 2024. Retrieved 26 September 2024. +
  206. +
  207. ^ Plato (1997). "Epigrams". In Cooper, John M.; Hutchinson, D.S. (eds.). Complete Works. Translated by Edmonds, J.M.; Cooper, John M. Indianapolis, Indiana: Hackett Publishing. p. 1744. ISBN 0-87220-349-2. LCCN 96053280. OCLC 36178550. Retrieved 27 September 2024. +
  208. +
  209. ^ Pinsent, John (1969). Greek Mythology (First ed.). London: Paul Hamlyn. p. 79. ISBN 978-0-600-02422-4. LCCN 78449216. OCLC 61702. Retrieved 3 October 2024. +
  210. +
  211. ^ "Atalanta (Atalante) - Arcadian Heroine of Greek Mythology". Theoi Project. Aaron J. Atsma. Archived from the original on 27 September 2024. Retrieved 3 October 2024. +
  212. +
  213. ^ Flieger, Verlyn (2005). Interrupted Music : The Making of Tolkien's Mythology. Kent, Ohio: Kent State University Press. pp. 122–123. ISBN 978-0-87338-824-5. LCCN 2004024490. OCLC 56805947. +
  214. +
  215. ^ Jump up to: a b "Why Do the Chinese Give Apples Around Christmas?". Teach English In China. 22 December 2019. Archived from the original on 1 October 2020. Retrieved 3 September 2024. +
  216. +
  217. ^ Jump up to: a b Macrone, Michael (1998). Brush up your Bible!. New York: Gramercy Books. pp. 15–16, 340–341. ISBN 978-0-517-20189-3. OCLC 38270894. Retrieved 31 July 2024. +
  218. +
  219. ^ Kissling, Paul J. (2004). Genesis. Vol. 1. Joplin, Missouri: College Press. p. 193. ISBN 978-0-89900-875-2. LCCN 2004022577. OCLC 56672257. Archived from the original on 26 January 2021. Retrieved 6 October 2020. +
  220. +
  221. ^ Genesis 2:17 +
  222. +
  223. ^ Hendel, Ronald S. (2013). The Book of Genesis: A Biography. Princeton, New Jersey: Princeton University Press. p. 114. ISBN 978-0-69114012-4. LCCN 2012015634. OCLC 788265521. Archived from the original on 5 March 2023. Retrieved 4 October 2024. +
  224. +
  225. ^ Mieder, Wolfgang; Kingsbury, Stewart A.; Harder, Kelsie B., eds. (1996) [1992]. A Dictionary of American Proverbs (Paperback ed.). New York: Oxford University Press. p. 23. ISBN 978-0-19-511133-0. LCCN 91015508. OCLC 23693799. Retrieved 23 August 2024. +
  226. +
  227. ^ Pollan, Michael (2001). The Botany of Desire: A Plant's-Eye View of the World (First ed.). New York: Random House. pp. 9, 22, 50. ISBN 978-0-375-50129-6. LCCN 00066479. OCLC 49803415. +
  228. +
+

Further reading

+ +
+
  • Media related to Apples at Wikimedia Commons
+ + + + + + + + + + + +
+
+ +
+
+ +
+ +
+
+
+
    +
    + + +
    \ No newline at end of file diff --git a/tests/async/test_content_scraper_strategy.py b/tests/async/test_content_scraper_strategy.py new file mode 100644 index 00000000..5dfa6362 --- /dev/null +++ b/tests/async/test_content_scraper_strategy.py @@ -0,0 +1,162 @@ +import asyncio +from bs4 import BeautifulSoup +from typing import Dict, Any +import os +import sys +import time +import csv +from tabulate import tabulate +from dataclasses import dataclass +from typing import List, Dict + +parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +from crawl4ai.content_scrapping_strategy import WebScrapingStrategy +from crawl4ai.content_scrapping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent +# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent + +@dataclass +class TestResult: + name: str + success: bool + images: int + internal_links: int + external_links: int + markdown_length: int + execution_time: float + +class StrategyTester: + def __init__(self): + self.new_scraper = WebScrapingStrategy() + self.current_scraper = WebScrapingStrategyCurrent() + with open(__location__ + '/sample_wikipedia.html', 'r', encoding='utf-8') as f: + self.WIKI_HTML = f.read() + self.results = {'new': [], 'current': []} + + def run_test(self, name: str, **kwargs) -> tuple[TestResult, TestResult]: + results = [] + for scraper in [self.new_scraper, self.current_scraper]: + start_time = time.time() + result = scraper._get_content_of_website_optimized( + url="https://en.wikipedia.org/wiki/Test", + html=self.WIKI_HTML, + **kwargs + ) + execution_time = time.time() - start_time + + test_result = TestResult( + name=name, + success=result['success'], + images=len(result['media']['images']), + internal_links=len(result['links']['internal']), + 
external_links=len(result['links']['external']), + markdown_length=len(result['markdown']), + execution_time=execution_time + ) + results.append(test_result) + + return results[0], results[1] # new, current + + def run_all_tests(self): + test_cases = [ + ("Basic Extraction", {}), + ("Exclude Tags", {'excluded_tags': ['table', 'div.infobox', 'div.navbox']}), + ("Word Threshold", {'word_count_threshold': 50}), + ("CSS Selector", {'css_selector': 'div.mw-parser-output > p'}), + ("Link Exclusions", { + 'exclude_external_links': True, + 'exclude_social_media_links': True, + 'exclude_domains': ['facebook.com', 'twitter.com'] + }), + ("Media Handling", { + 'exclude_external_images': True, + 'image_description_min_word_threshold': 20 + }), + ("Text Only", { + 'only_text': True, + 'remove_forms': True + }), + ("HTML Cleaning", { + 'clean_html': True, + 'keep_data_attributes': True + }), + ("HTML2Text Options", { + 'html2text': { + 'skip_internal_links': True, + 'single_line_break': True, + 'mark_code': True, + 'preserve_tags': ['pre', 'code'] + } + }) + ] + + all_results = [] + for name, kwargs in test_cases: + try: + new_result, current_result = self.run_test(name, **kwargs) + all_results.append((name, new_result, current_result)) + except Exception as e: + print(f"Error in {name}: {str(e)}") + + self.save_results_to_csv(all_results) + self.print_comparison_table(all_results) + + def save_results_to_csv(self, all_results: List[tuple]): + csv_file = os.path.join(__location__, 'strategy_comparison_results.csv') + with open(csv_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links', + 'External Links', 'Markdown Length', 'Execution Time']) + + for name, new_result, current_result in all_results: + writer.writerow([name, 'New', new_result.success, new_result.images, + new_result.internal_links, new_result.external_links, + new_result.markdown_length, f"{new_result.execution_time:.3f}"]) + 
writer.writerow([name, 'Current', current_result.success, current_result.images, + current_result.internal_links, current_result.external_links, + current_result.markdown_length, f"{current_result.execution_time:.3f}"]) + + def print_comparison_table(self, all_results: List[tuple]): + table_data = [] + headers = ['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links', + 'External Links', 'Markdown Length', 'Time (s)'] + + for name, new_result, current_result in all_results: + # Check for differences + differences = [] + if new_result.images != current_result.images: differences.append('images') + if new_result.internal_links != current_result.internal_links: differences.append('internal_links') + if new_result.external_links != current_result.external_links: differences.append('external_links') + if new_result.markdown_length != current_result.markdown_length: differences.append('markdown') + + # Add row for new strategy + new_row = [ + name, 'New', new_result.success, new_result.images, + new_result.internal_links, new_result.external_links, + new_result.markdown_length, f"{new_result.execution_time:.3f}" + ] + table_data.append(new_row) + + # Add row for current strategy + current_row = [ + '', 'Current', current_result.success, current_result.images, + current_result.internal_links, current_result.external_links, + current_result.markdown_length, f"{current_result.execution_time:.3f}" + ] + table_data.append(current_row) + + # Add difference summary if any + if differences: + table_data.append(['', '⚠️ Differences', ', '.join(differences), '', '', '', '', '']) + + # Add empty row for better readability + table_data.append([''] * len(headers)) + + print("\nStrategy Comparison Results:") + print(tabulate(table_data, headers=headers, tablefmt='grid')) + +if __name__ == "__main__": + tester = StrategyTester() + tester.run_all_tests() \ No newline at end of file From 17913f5acf28cfac775085b74496d1ed5aafcae6 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 
13 Nov 2024 20:00:29 +0800 Subject: [PATCH 05/50] feat(crawler): support local files and raw HTML input in AsyncWebCrawler --- crawl4ai/async_webcrawler.py | 49 +++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 9d0340dc..8415f9b9 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -104,6 +104,10 @@ class AsyncWebCrawler: extracted_content = None is_web_url = url.startswith(('http://', 'https://')) + is_local_file = url.startswith("file://") + is_raw_html = url.startswith("raw:") + _url = url if not is_raw_html else "Raw HTML" + if is_web_url and not bypass_cache and not self.always_by_pass_cache: cached = await async_db_manager.aget_cached_url(url) @@ -131,7 +135,7 @@ class AsyncWebCrawler: t2 = time.time() if verbose: print( - f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" + f"[LOG] 🚀 Crawling done for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" ) crawl_result = await self.aprocess_html( @@ -147,6 +151,9 @@ class AsyncWebCrawler: is_cached=bool(cached), async_response=async_response, bypass_cache=bypass_cache, + is_web_url = is_web_url, + is_local_file = is_local_file, + is_raw_html = is_raw_html, **kwargs, ) @@ -164,8 +171,8 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}") - return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg) + print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}") + return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg) async def arun_many( self, @@ -233,6 +240,7 @@ class AsyncWebCrawler: t = time.time() # Extract content from HTML try: + 
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" t1 = time.time() scrapping_strategy = WebScrapingStrategy() # result = await scrapping_strategy.ascrap( @@ -249,7 +257,7 @@ class AsyncWebCrawler: ) if verbose: print( - f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds" + f"[LOG] 🚀 Content extracted for {_url}, success: True, time taken: {time.time() - t1:.2f} seconds" ) if result is None: @@ -270,7 +278,7 @@ class AsyncWebCrawler: if extracted_content is None and extraction_strategy and chunking_strategy: if verbose: print( - f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}" + f"[LOG] 🔥 Extracting semantic blocks for {_url}, Strategy: {self.__class__.__name__}" ) # Check if extraction strategy is type of JsonCssExtractionStrategy @@ -285,7 +293,7 @@ class AsyncWebCrawler: if verbose: print( - f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds." + f"[LOG] 🚀 Extraction done for {_url}, time taken: {time.time() - t:.2f} seconds." 
) screenshot = None if not screenshot else screenshot @@ -296,20 +304,21 @@ class AsyncWebCrawler: response_headers = json.dumps(async_response.response_headers, ensure_ascii=False) - if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url( - url, - html, - cleaned_html, - markdown, - extracted_content, - True, - json.dumps(media), - json.dumps(links), - json.dumps(metadata), - screenshot=screenshot, - response_headers=response_headers, - ) + if not kwargs.get("is_raw_html", False): + if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: + await async_db_manager.acache_url( + url, + html, + cleaned_html, + markdown, + extracted_content, + True, + json.dumps(media), + json.dumps(links), + json.dumps(metadata), + screenshot=screenshot, + response_headers=response_headers, + ) return CrawlResult( url=url, From 3d00fee6c28e16556c7a51035586faad7f5e1639 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 14 Nov 2024 22:50:59 +0800 Subject: [PATCH 06/50] - In this commit, the library is updated to process file downloads. Users can now specify a download folder and trigger the download process via JavaScript or other means, with all files being saved. The list of downloaded files will also be added to the crawl result object. - Another thing this commit introduces is the concept of the Relevance Content Filter. This is an improvement over Fit Markdown. This class of strategies aims to extract the main content from a given page - the part that really matters and is useful to be processed. One strategy has been created using the BM25 algorithm, which finds chunks of text from the web page relevant to its title, descriptions, and keywords, or supports a given user query and matches them. The result is then returned to the main engine to be converted to Markdown. Plans include adding approaches using language models as well. 
- The cache database was updated to hold information about response headers and downloaded files. --- crawl4ai/async_crawler_strategy.py | 63 +- crawl4ai/async_crawler_strategy_0.3.73.py | 965 ---------------------- crawl4ai/async_database.py | 22 +- crawl4ai/async_webcrawler.py | 47 +- crawl4ai/content_cleaning_strategy.py | 198 ----- crawl4ai/content_filter_strategy.py | 344 ++++++++ crawl4ai/content_scrapping_strategy.py | 14 +- crawl4ai/models.py | 18 +- crawl4ai/utils.py | 55 ++ tests/async/test_async_doanloader.py | 229 +++++ 10 files changed, 739 insertions(+), 1216 deletions(-) delete mode 100644 crawl4ai/async_crawler_strategy_0.3.73.py delete mode 100644 crawl4ai/content_cleaning_strategy.py create mode 100644 crawl4ai/content_filter_strategy.py create mode 100644 tests/async/test_async_doanloader.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index baa06e47..83933a35 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -14,6 +14,7 @@ from pydantic import BaseModel import hashlib import json import uuid +from .models import AsyncCrawlResponse from playwright_stealth import StealthConfig, stealth_async @@ -148,15 +149,6 @@ class ManagedBrowser: except Exception as e: print(f"Error removing temporary directory: {e}") -class AsyncCrawlResponse(BaseModel): - html: str - response_headers: Dict[str, str] - status_code: int - screenshot: Optional[str] = None - get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None - - class Config: - arbitrary_types_allowed = True class AsyncCrawlerStrategy(ABC): @abstractmethod @@ -215,6 +207,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'before_retrieve_html': None } self.extra_args = kwargs.get("extra_args", []) + self.accept_downloads = kwargs.get("accept_downloads", False) + self.downloads_path = kwargs.get("downloads_path") + self._downloaded_files = [] # Track downloaded files for current crawl + 
if self.accept_downloads and not self.downloads_path: + self.downloads_path = os.path.join(os.getcwd(), "downloads") + os.makedirs(self.downloads_path, exist_ok=True) + async def __aenter__(self): await self.start() @@ -250,7 +249,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Set up the default context if self.default_context: await self.default_context.set_extra_http_headers(self.headers) - + if self.accept_downloads: + await self.default_context.set_default_timeout(60000) + await self.default_context.set_default_navigation_timeout(60000) + self.default_context._impl_obj._options["accept_downloads"] = True + self.default_context._impl_obj._options["downloads_path"] = self.downloads_path + if self.user_agent: await self.default_context.set_extra_http_headers({ "User-Agent": self.user_agent @@ -301,12 +305,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_persistent_context and self.user_data_dir: self.browser = await self.playwright.chromium.launch_persistent_context( user_data_dir=self.user_data_dir, + accept_downloads=self.accept_downloads, + downloads_path=self.downloads_path if self.accept_downloads else None, **browser_args ) self.default_context = self.browser else: self.browser = await self.playwright.chromium.launch(**browser_args) - + except Exception as e: # Fallback to chromium if Chrome channel fails if "chrome" in str(e) and browser_args.get("channel") == "chrome": @@ -565,6 +571,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers = {} status_code = None + # Reset downloaded files list for new crawl + self._downloaded_files = [] + self._cleanup_expired_sessions() session_id = kwargs.get("session_id") @@ -592,10 +601,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Normal context creation for non-persistent or non-Chrome browsers context = await self.browser.new_context( user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, + 
viewport={"width": 1200, "height": 800}, proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True + java_script_enabled=True, + accept_downloads=self.accept_downloads, + downloads_path=self.downloads_path if self.accept_downloads else None ) await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await context.set_extra_http_headers(self.headers) @@ -655,6 +665,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) try: + # Set up download handling if enabled + if self.accept_downloads: + page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) + if self.verbose: print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") @@ -886,7 +900,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, - get_delayed_content=get_delayed_content + get_delayed_content=get_delayed_content, + downloaded_files=self._downloaded_files if self._downloaded_files else None ) return response except Error as e: @@ -896,6 +911,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.close() # await context.close() + async def _handle_download(self, download): + """Handle file downloads.""" + try: + suggested_filename = download.suggested_filename + download_path = os.path.join(self.downloads_path, suggested_filename) + + if self.verbose: + print(f"[LOG] 📥 Downloading {suggested_filename} to {download_path}") + + await download.save_as(download_path) + self._downloaded_files.append(download_path) + + if self.verbose: + print(f"[LOG] ✅ Downloaded {suggested_filename} successfully") + except Exception as e: + if self.verbose: + print(f"[ERROR] Failed to handle download: {str(e)}") + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count 
= kwargs.get('semaphore_count', 5) # Adjust as needed semaphore = asyncio.Semaphore(semaphore_count) diff --git a/crawl4ai/async_crawler_strategy_0.3.73.py b/crawl4ai/async_crawler_strategy_0.3.73.py deleted file mode 100644 index 54835dad..00000000 --- a/crawl4ai/async_crawler_strategy_0.3.73.py +++ /dev/null @@ -1,965 +0,0 @@ -import asyncio -import base64 -import time -from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional, Awaitable -import os, sys, shutil -import tempfile, subprocess -from playwright.async_api import async_playwright, Page, Browser, Error -from io import BytesIO -from PIL import Image, ImageDraw, ImageFont -from pathlib import Path -from playwright.async_api import ProxySettings -from pydantic import BaseModel -import hashlib -import json -import uuid - -from playwright_stealth import StealthConfig, stealth_async - -stealth_config = StealthConfig( - webdriver=True, - chrome_app=True, - chrome_csi=True, - chrome_load_times=True, - chrome_runtime=True, - navigator_languages=True, - navigator_plugins=True, - navigator_permissions=True, - webgl_vendor=True, - outerdimensions=True, - navigator_hardware_concurrency=True, - media_codecs=True, -) - - -class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): - self.browser_type = browser_type - self.user_data_dir = user_data_dir - self.headless = headless - self.browser_process = None - self.temp_dir = None - self.debugging_port = 9222 - - async def start(self) -> str: - """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. 
- """ - - # Create temp dir if needed - if not self.user_data_dir: - self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") - self.user_data_dir = self.temp_dir - - # Get browser path and args based on OS and browser type - browser_path = self._get_browser_path() - args = self._get_browser_args() - - # Start browser process - try: - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - await asyncio.sleep(2) # Give browser time to start - return f"http://localhost:{self.debugging_port}" - except Exception as e: - await self.cleanup() - raise Exception(f"Failed to start browser: {e}") - - def _get_browser_path(self) -> str: - """Returns the browser executable path based on OS and browser type""" - if sys.platform == "darwin": # macOS - paths = { - "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", - "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" - } - elif sys.platform == "win32": # Windows - paths = { - "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", - "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", - "webkit": None # WebKit not supported on Windows - } - else: # Linux - paths = { - "chromium": "google-chrome", - "firefox": "firefox", - "webkit": None # WebKit not supported on Linux - } - - return paths.get(self.browser_type) - - def _get_browser_args(self) -> List[str]: - """Returns browser-specific command line arguments""" - base_args = [self._get_browser_path()] - - if self.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.debugging_port}", - f"--user-data-dir={self.user_data_dir}", - ] - if self.headless: - args.append("--headless=new") - elif self.browser_type == "firefox": - args = [ - "--remote-debugging-port", str(self.debugging_port), - "--profile", self.user_data_dir, - ] - if self.headless: - args.append("--headless") - else: - raise 
NotImplementedError(f"Browser type {self.browser_type} not supported") - - return base_args + args - - async def cleanup(self): - """Cleanup browser process and temporary directory""" - if self.browser_process: - try: - self.browser_process.terminate() - await asyncio.sleep(1) - if self.browser_process.poll() is None: - self.browser_process.kill() - except Exception as e: - print(f"Error terminating browser: {e}") - - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - except Exception as e: - print(f"Error removing temporary directory: {e}") - -class AsyncCrawlResponse(BaseModel): - html: str - response_headers: Dict[str, str] - status_code: int - screenshot: Optional[str] = None - get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None - - class Config: - arbitrary_types_allowed = True - -class AsyncCrawlerStrategy(ABC): - @abstractmethod - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - pass - - @abstractmethod - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - pass - - @abstractmethod - async def take_screenshot(self, **kwargs) -> str: - pass - - @abstractmethod - def update_user_agent(self, user_agent: str): - pass - - @abstractmethod - def set_hook(self, hook_type: str, hook: Callable): - pass - -class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, use_cached_html=False, js_code=None, **kwargs): - self.use_cached_html = use_cached_html - self.user_agent = kwargs.get( - "user_agent", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - ) - self.proxy = kwargs.get("proxy") - self.proxy_config = kwargs.get("proxy_config") - self.headless = kwargs.get("headless", True) - self.browser_type = kwargs.get("browser_type", "chromium") - self.headers = kwargs.get("headers", {}) - self.sessions = {} - self.session_ttl = 1800 - self.js_code = js_code - 
self.verbose = kwargs.get("verbose", False) - self.playwright = None - self.browser = None - self.sleep_on_close = kwargs.get("sleep_on_close", False) - self.use_managed_browser = kwargs.get("use_managed_browser", False) - self.user_data_dir = kwargs.get("user_data_dir", None) - self.use_persistent_context = kwargs.get("use_persistent_context", False) - self.chrome_channel = kwargs.get("chrome_channel", "chrome") - self.managed_browser = None - self.default_context = None - self.hooks = { - 'on_browser_created': None, - 'on_user_agent_updated': None, - 'on_execution_started': None, - 'before_goto': None, - 'after_goto': None, - 'before_return_html': None, - 'before_retrieve_html': None - } - self.extra_args = kwargs.get("extra_args", []) - - async def __aenter__(self): - await self.start() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.close() - - async def start(self): - if self.playwright is None: - self.playwright = await async_playwright().start() - if self.browser is None: - if self.use_managed_browser: - # Use managed browser approach - self.managed_browser = ManagedBrowser( - browser_type=self.browser_type, - user_data_dir=self.user_data_dir, - headless=self.headless - ) - cdp_url = await self.managed_browser.start() - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - - # Get the default context that maintains the user profile - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - # If no default context exists, create one - self.default_context = await self.browser.new_context( - viewport={"width": 1920, "height": 1080} - ) - - # Set up the default context - if self.default_context: - await self.default_context.set_extra_http_headers(self.headers) - - if self.user_agent: - await self.default_context.set_extra_http_headers({ - "User-Agent": self.user_agent - }) - else: - browser_args = { - "headless": self.headless, - "args": [ - "--disable-gpu", - 
"--no-sandbox", - "--disable-dev-shm-usage", - "--disable-blink-features=AutomationControlled", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - # "--disable-http2", - # "--headless=new", # Use the new headless mode - ] - } - - # Add extra args if provided - if self.extra_args: - browser_args["args"].extend(self.extra_args) - - # Add proxy settings if a proxy is specified - if self.proxy: - proxy_settings = ProxySettings(server=self.proxy) - browser_args["proxy"] = proxy_settings - elif self.proxy_config: - proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password")) - browser_args["proxy"] = proxy_settings - - # Select the appropriate browser based on the browser_type - if self.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - - # Update the headless configuration - if self.headless: - # Use the new headless mode explicitly - browser_args["args"].append("--headless=new") - - await self.execute_hook('on_browser_created', self.browser) - - async def close(self): - if self.sleep_on_close: - await asyncio.sleep(0.5) - - # Close all active sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self.kill_session(session_id) - - if self.browser: - await self.browser.close() - self.browser = None - - if self.managed_browser: - await self.managed_browser.cleanup() - self.managed_browser = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None - - def __del__(self): - if self.browser or self.playwright: - asyncio.get_event_loop().run_until_complete(self.close()) - - def set_hook(self, hook_type: str, 
hook: Callable): - if hook_type in self.hooks: - self.hooks[hook_type] = hook - else: - raise ValueError(f"Invalid hook type: {hook_type}") - - async def execute_hook(self, hook_type: str, *args): - hook = self.hooks.get(hook_type) - if hook: - if asyncio.iscoroutinefunction(hook): - return await hook(*args) - else: - return hook(*args) - return args[0] if args else None - - def update_user_agent(self, user_agent: str): - self.user_agent = user_agent - - def set_custom_headers(self, headers: Dict[str, str]): - self.headers = headers - - async def kill_session(self, session_id: str): - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - if not self.use_managed_browser: - await context.close() - del self.sessions[session_id] - - def _cleanup_expired_sessions(self): - current_time = time.time() - expired_sessions = [ - sid for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) - - async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): - wait_for = wait_for.strip() - - if wait_for.startswith('js:'): - # Explicitly specified JavaScript - js_code = wait_for[3:].strip() - return await self.csp_compliant_wait(page, js_code, timeout) - elif wait_for.startswith('css:'): - # Explicitly specified CSS selector - css_selector = wait_for[4:].strip() - try: - await page.wait_for_selector(css_selector, timeout=timeout) - except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") - else: - raise ValueError(f"Invalid CSS selector: '{css_selector}'") - else: - # Auto-detect based on content - if wait_for.startswith('()') or wait_for.startswith('function'): - # It's likely a JavaScript function - return await self.csp_compliant_wait(page, wait_for, timeout) - else: - # Assume it's a CSS selector first - try: - 
await page.wait_for_selector(wait_for, timeout=timeout) - except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") - else: - # If it's not a timeout error, it might be an invalid selector - # Let's try to evaluate it as a JavaScript function as a fallback - try: - return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) - except Error: - raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " - "It should be either a valid CSS selector, a JavaScript function, " - "or explicitly prefixed with 'js:' or 'css:'.") - - async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): - wrapper_js = f""" - async () => {{ - const userFunction = {user_wait_function}; - const startTime = Date.now(); - while (true) {{ - if (await userFunction()) {{ - return true; - }} - if (Date.now() - startTime > {timeout}) {{ - throw new Error('Timeout waiting for condition'); - }} - await new Promise(resolve => setTimeout(resolve, 100)); - }} - }} - """ - - try: - await page.evaluate(wrapper_js) - except TimeoutError: - raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") - except Exception as e: - raise RuntimeError(f"Error in wait condition: {str(e)}") - - async def process_iframes(self, page): - # Find all iframes - iframes = await page.query_selector_all('iframe') - - for i, iframe in enumerate(iframes): - try: - # Add a unique identifier to the iframe - await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') - - # Get the frame associated with this iframe - frame = await iframe.content_frame() - - if frame: - # Wait for the frame to load - await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout - - # Extract the content of the iframe's body - iframe_content = await frame.evaluate('() => document.body.innerHTML') - - # Generate a unique class name for this iframe - class_name = f'extracted-iframe-content-{i}' 
- - # Replace the iframe with a div containing the extracted content - _iframe = iframe_content.replace('`', '\\`') - await page.evaluate(f""" - () => {{ - const iframe = document.getElementById('iframe-{i}'); - const div = document.createElement('div'); - div.innerHTML = `{_iframe}`; - div.className = '{class_name}'; - iframe.replaceWith(div); - }} - """) - else: - print(f"Warning: Could not access content frame for iframe {i}") - except Exception as e: - print(f"Error processing iframe {i}: {str(e)}") - - # Return the page object - return page - - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - response_headers = {} - status_code = None - - self._cleanup_expired_sessions() - session_id = kwargs.get("session_id") - - # Handle page creation differently for managed browser - if self.use_managed_browser: - if session_id: - # Reuse existing session if available - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not page: - # Create new page in default context if session doesn't exist - page = await self.default_context.new_page() - self.sessions[session_id] = (self.default_context, page, time.time()) - else: - # Create new page in default context for non-session requests - page = await self.default_context.new_page() - else: - if session_id: - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not context: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True - ) - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) - await context.set_extra_http_headers(self.headers) - page = await context.new_page() - self.sessions[session_id] = (context, page, time.time()) - else: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": 
self.proxy} if self.proxy else None - ) - await context.set_extra_http_headers(self.headers) - - if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Inject scripts to override navigator properties - await context.add_init_script(""" - // Pass the Permissions Test. - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - Object.defineProperty(navigator, 'webdriver', { - get: () => undefined - }); - window.navigator.chrome = { - runtime: {}, - // Add other properties if necessary - }; - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5], - }); - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'], - }); - Object.defineProperty(document, 'hidden', { - get: () => false - }); - Object.defineProperty(document, 'visibilityState', { - get: () => 'visible' - }); - """) - - page = await context.new_page() - if kwargs.get("magic", False): - await stealth_async(page, stealth_config) - - # Add console message and error logging - if kwargs.get("log_console", False): - page.on("console", lambda msg: print(f"Console: {msg.text}")) - page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) - - try: - if self.verbose: - print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") - - if self.use_cached_html: - cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - if os.path.exists(cache_file_path): - html = "" - with open(cache_file_path, "r") as f: - html = f.read() - # retrieve response headers and status code from cache - with open(cache_file_path + ".meta", "r") as f: - meta = json.load(f) - response_headers = meta.get("response_headers", {}) - status_code = meta.get("status_code") - response = 
AsyncCrawlResponse( - html=html, response_headers=response_headers, status_code=status_code - ) - return response - - if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page) - - # response = await page.goto( - # url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) - # ) - - # Add retry logic for HTTP2 errors - max_retries = kwargs.get("max_retries", 3) - current_try = 0 - - while current_try < max_retries: - try: - response = await page.goto( - url, - # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), - wait_until=kwargs.get("wait_until", "networkidle"), - timeout=kwargs.get("page_timeout", 60000) - ) - break - except Exception as e: - current_try += 1 - if "ERR_HTTP2_PROTOCOL_ERROR" in str(e): - if current_try < max_retries: - # Add exponential backoff - await asyncio.sleep(2 ** current_try) - # Try with different protocol - if 'args' not in kwargs: - kwargs['args'] = [] - kwargs['args'].extend(['--disable-http2']) - continue - if current_try == max_retries: - raise - - # response = await page.goto("about:blank") - # await page.evaluate(f"window.location.href = '{url}'") - - await self.execute_hook('after_goto', page) - - # Get status code and headers - status_code = response.status - response_headers = response.headers - else: - status_code = 200 - response_headers = {} - - # Replace the current wait_for_selector line with this more robust check: - try: - # First wait for body to exist, regardless of visibility - await page.wait_for_selector('body', state='attached', timeout=30000) - - # Then wait for it to become visible by checking CSS - await page.wait_for_function(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - } - """, timeout=30000) - - except Error as e: - # If waiting fails, let's try to diagnose the issue - visibility_info = await 
page.evaluate(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return { - display: style.display, - visibility: style.visibility, - opacity: style.opacity, - hasContent: body.innerHTML.length, - classList: Array.from(body.classList) - } - } - """) - - if self.verbose: - print(f"Body visibility debug info: {visibility_info}") - - # Even if body is hidden, we might still want to proceed - if kwargs.get('ignore_body_visibility', True): - if self.verbose: - print("Proceeding despite hidden body...") - pass - else: - raise Error(f"Body element is hidden: {visibility_info}") - - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - - js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) - if js_code: - if isinstance(js_code, str): - await page.evaluate(js_code) - elif isinstance(js_code, list): - for js in js_code: - await page.evaluate(js) - - await page.wait_for_load_state('networkidle') - # Check for on execution event - await self.execute_hook('on_execution_started', page) - - if kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Simulate user interactions - await page.mouse.move(100, 100) - await page.mouse.down() - await page.mouse.up() - await page.keyboard.press('ArrowDown') - - # Handle the wait_for parameter - wait_for = kwargs.get("wait_for") - if wait_for: - try: - await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) - except Exception as e: - raise RuntimeError(f"Wait condition failed: {str(e)}") - - # Update image dimensions - update_image_dimensions_js = """ - () => { - return new Promise((resolve) => { - const filterImage = (img) => { - // Filter out images that are too small - if (img.width < 100 && img.height < 100) return false; - - // Filter out images that are not visible - const rect = img.getBoundingClientRect(); - if (rect.width === 0 || rect.height === 0) return false; - - // Filter out images with certain class names (e.g., 
icons, thumbnails) - if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; - - // Filter out images with certain patterns in their src (e.g., placeholder images) - if (img.src.includes('placeholder') || img.src.includes('icon')) return false; - - return true; - }; - - const images = Array.from(document.querySelectorAll('img')).filter(filterImage); - let imagesLeft = images.length; - - if (imagesLeft === 0) { - resolve(); - return; - } - - const checkImage = (img) => { - if (img.complete && img.naturalWidth !== 0) { - img.setAttribute('width', img.naturalWidth); - img.setAttribute('height', img.naturalHeight); - imagesLeft--; - if (imagesLeft === 0) resolve(); - } - }; - - images.forEach(img => { - checkImage(img); - if (!img.complete) { - img.onload = () => { - checkImage(img); - }; - img.onerror = () => { - imagesLeft--; - if (imagesLeft === 0) resolve(); - }; - } - }); - - // Fallback timeout of 5 seconds - // setTimeout(() => resolve(), 5000); - resolve(); - }); - } - """ - await page.evaluate(update_image_dimensions_js) - - # Wait a bit for any onload events to complete - await page.wait_for_timeout(100) - - # Process iframes - if kwargs.get("process_iframes", False): - page = await self.process_iframes(page) - - await self.execute_hook('before_retrieve_html', page) - # Check if delay_before_return_html is set then wait for that time - delay_before_return_html = kwargs.get("delay_before_return_html") - if delay_before_return_html: - await asyncio.sleep(delay_before_return_html) - - # Check for remove_overlay_elements parameter - if kwargs.get("remove_overlay_elements", False): - await self.remove_overlay_elements(page) - - html = await page.content() - await self.execute_hook('before_return_html', page, html) - - # Check if kwargs has screenshot=True then take screenshot - screenshot_data = None - if kwargs.get("screenshot"): - # Check we have screenshot_wait_for parameter, if we have simply wait for that time - 
screenshot_wait_for = kwargs.get("screenshot_wait_for") - if screenshot_wait_for: - await asyncio.sleep(screenshot_wait_for) - screenshot_data = await self.take_screenshot(page) - - if self.verbose: - print(f"[LOG] ✅ Crawled {url} successfully!") - - if self.use_cached_html: - cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - with open(cache_file_path, "w", encoding="utf-8") as f: - f.write(html) - # store response headers and status code in cache - with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: - json.dump({ - "response_headers": response_headers, - "status_code": status_code - }, f) - - async def get_delayed_content(delay: float = 5.0) -> str: - if self.verbose: - print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") - await asyncio.sleep(delay) - return await page.content() - - response = AsyncCrawlResponse( - html=html, - response_headers=response_headers, - status_code=status_code, - screenshot=screenshot_data, - get_delayed_content=get_delayed_content - ) - return response - except Error as e: - raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") - # finally: - # if not session_id: - # await page.close() - # await context.close() - - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed - semaphore = asyncio.Semaphore(semaphore_count) - - async def crawl_with_semaphore(url): - async with semaphore: - return await self.crawl(url, **kwargs) - - tasks = [crawl_with_semaphore(url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - return [result if not isinstance(result, Exception) else str(result) for result in results] - - async def remove_overlay_elements(self, page: Page) -> None: - """ - Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. 
- - Args: - page (Page): The Playwright page instance - """ - remove_overlays_js = """ - async () => { - // Function to check if element is visible - const isVisible = (elem) => { - const style = window.getComputedStyle(elem); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - }; - - // Common selectors for popups and overlays - const commonSelectors = [ - // Close buttons first - 'button[class*="close" i]', 'button[class*="dismiss" i]', - 'button[aria-label*="close" i]', 'button[title*="close" i]', - 'a[class*="close" i]', 'span[class*="close" i]', - - // Cookie notices - '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', - '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', - - // Newsletter/subscription dialogs - '[class*="newsletter" i]', '[class*="subscribe" i]', - - // Generic popups/modals - '[class*="popup" i]', '[class*="modal" i]', - '[class*="overlay" i]', '[class*="dialog" i]', - '[role="dialog"]', '[role="alertdialog"]' - ]; - - // Try to click close buttons first - for (const selector of commonSelectors.slice(0, 6)) { - const closeButtons = document.querySelectorAll(selector); - for (const button of closeButtons) { - if (isVisible(button)) { - try { - button.click(); - await new Promise(resolve => setTimeout(resolve, 100)); - } catch (e) { - console.log('Error clicking button:', e); - } - } - } - } - - // Remove remaining overlay elements - const removeOverlays = () => { - // Find elements with high z-index - const allElements = document.querySelectorAll('*'); - for (const elem of allElements) { - const style = window.getComputedStyle(elem); - const zIndex = parseInt(style.zIndex); - const position = style.position; - - if ( - isVisible(elem) && - (zIndex > 999 || position === 'fixed' || position === 'absolute') && - ( - elem.offsetWidth > window.innerWidth * 0.5 || - elem.offsetHeight > window.innerHeight * 0.5 || - style.backgroundColor.includes('rgba') || - parseFloat(style.opacity) < 1 - 
) - ) { - elem.remove(); - } - } - - // Remove elements matching common selectors - for (const selector of commonSelectors) { - const elements = document.querySelectorAll(selector); - elements.forEach(elem => { - if (isVisible(elem)) { - elem.remove(); - } - }); - } - }; - - // Remove overlay elements - removeOverlays(); - - // Remove any fixed/sticky position elements at the top/bottom - const removeFixedElements = () => { - const elements = document.querySelectorAll('*'); - elements.forEach(elem => { - const style = window.getComputedStyle(elem); - if ( - (style.position === 'fixed' || style.position === 'sticky') && - isVisible(elem) - ) { - elem.remove(); - } - }); - }; - - removeFixedElements(); - - // Remove empty block elements as: div, p, span, etc. - const removeEmptyBlockElements = () => { - const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); - blockElements.forEach(elem => { - if (elem.innerText.trim() === '') { - elem.remove(); - } - }); - }; - - // Remove margin-right and padding-right from body (often added by modal scripts) - document.body.style.marginRight = '0px'; - document.body.style.paddingRight = '0px'; - document.body.style.overflow = 'auto'; - - // Wait a bit for any animations to complete - await new Promise(resolve => setTimeout(resolve, 100)); - } - """ - - try: - await page.evaluate(remove_overlays_js) - await page.wait_for_timeout(500) # Wait for any animations to complete - except Exception as e: - if self.verbose: - print(f"Warning: Failed to remove overlay elements: {str(e)}") - - async def take_screenshot(self, page: Page) -> str: - try: - # The page is already loaded, just take the screenshot - screenshot = await page.screenshot(full_page=True) - return base64.b64encode(screenshot).decode('utf-8') - except Exception as e: - error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) - - # Generate an 
error image - img = Image.new('RGB', (800, 600), color='black') - draw = ImageDraw.Draw(img) - font = ImageFont.load_default() - draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) - - buffered = BytesIO() - img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') - finally: - await page.close() - diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 273ca6c9..c52e3db6 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -91,7 +91,8 @@ class AsyncDatabaseManager: links TEXT DEFAULT "{}", metadata TEXT DEFAULT "{}", screenshot TEXT DEFAULT "", - response_headers TEXT DEFAULT "{}" -- New column added + response_headers TEXT DEFAULT "{}", + downloaded_files TEXT DEFAULT "{}" -- New column added ) ''') @@ -108,7 +109,7 @@ class AsyncDatabaseManager: column_names = await self.execute_with_retry(_check_columns) # List of new columns to add - new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers'] + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] for column in new_columns: if column not in column_names: @@ -130,7 +131,7 @@ class AsyncDatabaseManager: async def _get(db): async with db.execute( ''' - SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files FROM crawled_data WHERE url = ? 
''', (url,) @@ -149,7 +150,8 @@ class AsyncDatabaseManager: json.loads(row[7] or '{}'), # links json.loads(row[8] or '{}'), # metadata row[9], # screenshot - json.loads(row[10] or '{}') # response_headers + json.loads(row[10] or '{}'), # response_headers + json.loads(row[11] or '[]') # downloaded_files ) return None @@ -171,15 +173,16 @@ class AsyncDatabaseManager: links: str = "{}", metadata: str = "{}", screenshot: str = "", - response_headers: str = "{}" # New parameter added + response_headers: str = "{}", + downloaded_files: str = "[]" ): """Cache URL data with retry logic""" async def _cache(db): await db.execute(''' INSERT INTO crawled_data ( - url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT(url) DO UPDATE SET html = excluded.html, cleaned_html = excluded.cleaned_html, @@ -190,8 +193,9 @@ class AsyncDatabaseManager: links = excluded.links, metadata = excluded.metadata, screenshot = excluded.screenshot, - response_headers = excluded.response_headers -- Update response_headers - ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers)) + response_headers = excluded.response_headers, -- Update response_headers + downloaded_files = excluded.downloaded_files + ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files)) try: await self.execute_with_retry(_cache) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 8415f9b9..cec1ace0 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -160,12 +160,35 @@ class AsyncWebCrawler: if async_response: crawl_result.status_code = async_response.status_code crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files else: crawl_result.status_code = 200 crawl_result.response_headers = cached[10] + # crawl_result.downloaded_files = cached[11] crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) + + + if not is_raw_html: + if not bool(cached) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: + await async_db_manager.acache_url( + url = url, + html = html, + cleaned_html = crawl_result.cleaned_html, + markdown = crawl_result.markdown, + extracted_content = extracted_content, + success = True, + media = json.dumps(crawl_result.media), + links = json.dumps(crawl_result.links), + metadata = json.dumps(crawl_result.metadata), + screenshot=screenshot, + response_headers=json.dumps(crawl_result.response_headers), + downloaded_files=json.dumps(crawl_result.downloaded_files), + + ) + + return 
crawl_result except Exception as e: @@ -233,8 +256,6 @@ class AsyncWebCrawler: css_selector: str, screenshot: str, verbose: bool, - is_cached: bool, - async_response: Optional[AsyncCrawlResponse], **kwargs, ) -> CrawlResult: t = time.time() @@ -298,28 +319,6 @@ class AsyncWebCrawler: screenshot = None if not screenshot else screenshot - response_headers = "{}" # Default value - if async_response: - # Serialize response_headers dict to JSON string - response_headers = json.dumps(async_response.response_headers, ensure_ascii=False) - - - if not kwargs.get("is_raw_html", False): - if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url( - url, - html, - cleaned_html, - markdown, - extracted_content, - True, - json.dumps(media), - json.dumps(links), - json.dumps(metadata), - screenshot=screenshot, - response_headers=response_headers, - ) - return CrawlResult( url=url, html=html, diff --git a/crawl4ai/content_cleaning_strategy.py b/crawl4ai/content_cleaning_strategy.py deleted file mode 100644 index b8a5053d..00000000 --- a/crawl4ai/content_cleaning_strategy.py +++ /dev/null @@ -1,198 +0,0 @@ -from bs4 import BeautifulSoup, Tag -import re -from typing import Optional - -class ContentCleaningStrategy: - def __init__(self): - # Precompile regex patterns for performance - self.negative_patterns = re.compile(r'nav|footer|header|sidebar|ads|comment', re.I) - self.positive_patterns = re.compile(r'content|article|main|post', re.I) - self.priority_tags = {'article', 'main', 'section', 'div'} - self.non_content_tags = {'nav', 'footer', 'header', 'aside'} - # Thresholds - self.text_density_threshold = 9.0 - self.min_word_count = 50 - self.link_density_threshold = 0.2 - self.max_dom_depth = 10 # To prevent excessive DOM traversal - - def clean(self, clean_html: str, soup = None) -> str: - """ - Main function that takes cleaned HTML and returns super cleaned HTML. - - Args: - clean_html (str): The cleaned HTML content. 
- - Returns: - str: The super cleaned HTML containing only the main content. - """ - try: - if not clean_html or not isinstance(clean_html, str): - return '' - if not soup: - # soup = BeautifulSoup(clean_html, 'html.parser') - soup = BeautifulSoup(clean_html, 'lxml') - main_content = self.extract_main_content(soup) - if main_content: - super_clean_element = self.clean_element(main_content) - return super_clean_element.encode_contents().decode('utf-8') - else: - return '' - except Exception: - # Handle exceptions silently or log them as needed - return '' - - def extract_main_content(self, soup) -> Optional[Tag]: - """ - Identifies and extracts the main content element from the HTML. - - Args: - soup (BeautifulSoup): The parsed HTML soup. - - Returns: - Optional[Tag]: The Tag object containing the main content, or None if not found. - """ - candidates = [] - for element in soup.find_all(self.priority_tags): - if self.is_non_content_tag(element): - continue - if self.has_negative_class_id(element): - continue - score = self.calculate_content_score(element) - candidates.append((score, element)) - - if not candidates: - return None - - # Sort candidates by score in descending order - candidates.sort(key=lambda x: x[0], reverse=True) - # Select the element with the highest score - best_element = candidates[0][1] - return best_element - - def calculate_content_score(self, element: Tag) -> float: - """ - Calculates a score for an element based on various heuristics. - - Args: - element (Tag): The HTML element to score. - - Returns: - float: The content score of the element. 
- """ - score = 0.0 - - if self.is_priority_tag(element): - score += 5.0 - if self.has_positive_class_id(element): - score += 3.0 - if self.has_negative_class_id(element): - score -= 3.0 - if self.is_high_text_density(element): - score += 2.0 - if self.is_low_link_density(element): - score += 2.0 - if self.has_sufficient_content(element): - score += 2.0 - if self.has_headings(element): - score += 3.0 - - dom_depth = self.calculate_dom_depth(element) - score += min(dom_depth, self.max_dom_depth) * 0.5 # Adjust weight as needed - - return score - - def is_priority_tag(self, element: Tag) -> bool: - """Checks if the element is a priority tag.""" - return element.name in self.priority_tags - - def is_non_content_tag(self, element: Tag) -> bool: - """Checks if the element is a non-content tag.""" - return element.name in self.non_content_tags - - def has_negative_class_id(self, element: Tag) -> bool: - """Checks if the element has negative indicators in its class or id.""" - class_id = ' '.join(filter(None, [ - self.get_attr_str(element.get('class')), - element.get('id', '') - ])) - return bool(self.negative_patterns.search(class_id)) - - def has_positive_class_id(self, element: Tag) -> bool: - """Checks if the element has positive indicators in its class or id.""" - class_id = ' '.join(filter(None, [ - self.get_attr_str(element.get('class')), - element.get('id', '') - ])) - return bool(self.positive_patterns.search(class_id)) - - @staticmethod - def get_attr_str(attr) -> str: - """Converts an attribute value to a string.""" - if isinstance(attr, list): - return ' '.join(attr) - elif isinstance(attr, str): - return attr - else: - return '' - - def is_high_text_density(self, element: Tag) -> bool: - """Determines if the element has high text density.""" - text_density = self.calculate_text_density(element) - return text_density > self.text_density_threshold - - def calculate_text_density(self, element: Tag) -> float: - """Calculates the text density of an element.""" - 
text_length = len(element.get_text(strip=True)) - tag_count = len(element.find_all()) - tag_count = tag_count or 1 # Prevent division by zero - return text_length / tag_count - - def is_low_link_density(self, element: Tag) -> bool: - """Determines if the element has low link density.""" - link_density = self.calculate_link_density(element) - return link_density < self.link_density_threshold - - def calculate_link_density(self, element: Tag) -> float: - """Calculates the link density of an element.""" - text = element.get_text(strip=True) - if not text: - return 0.0 - link_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a')) - return len(link_text) / len(text) if text else 0.0 - - def has_sufficient_content(self, element: Tag) -> bool: - """Checks if the element has sufficient word count.""" - word_count = len(element.get_text(strip=True).split()) - return word_count >= self.min_word_count - - def calculate_dom_depth(self, element: Tag) -> int: - """Calculates the depth of an element in the DOM tree.""" - depth = 0 - current_element = element - while current_element.parent and depth < self.max_dom_depth: - depth += 1 - current_element = current_element.parent - return depth - - def has_headings(self, element: Tag) -> bool: - """Checks if the element contains heading tags.""" - return bool(element.find(['h1', 'h2', 'h3'])) - - def clean_element(self, element: Tag) -> Tag: - """ - Cleans the selected element by removing unnecessary attributes and nested non-content elements. - - Args: - element (Tag): The HTML element to clean. - - Returns: - Tag: The cleaned HTML element. 
- """ - for tag in element.find_all(['script', 'style', 'aside']): - tag.decompose() - for tag in element.find_all(): - attrs = dict(tag.attrs) - for attr in attrs: - if attr in ['style', 'onclick', 'onmouseover', 'align', 'bgcolor']: - del tag.attrs[attr] - return element diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py new file mode 100644 index 00000000..850ebf11 --- /dev/null +++ b/crawl4ai/content_filter_strategy.py @@ -0,0 +1,344 @@ +import os +import re +import time +from bs4 import BeautifulSoup, Tag +from typing import List, Tuple, Dict +from rank_bm25 import BM25Okapi +import nltk +from time import perf_counter +from html5lib import parse, treebuilders +from time import perf_counter +from collections import deque +from bs4 import BeautifulSoup, NavigableString, Tag +from .utils import clean_tokens +from abc import ABC, abstractmethod + +class RelevantContentFilter(ABC): + def __init__(self, user_query: str = None): + self.user_query = user_query + self.included_tags = { + # Primary structure + 'article', 'main', 'section', 'div', + # List structures + 'ul', 'ol', 'li', 'dl', 'dt', 'dd', + # Text content + 'p', 'span', 'blockquote', 'pre', 'code', + # Headers + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + # Tables + 'table', 'thead', 'tbody', 'tr', 'td', 'th', + # Other semantic elements + 'figure', 'figcaption', 'details', 'summary', + # Text formatting + 'em', 'strong', 'b', 'i', 'mark', 'small', + # Rich content + 'time', 'address', 'cite', 'q' + } + self.excluded_tags = { + 'nav', 'footer', 'header', 'aside', 'script', + 'style', 'form', 'iframe', 'noscript' + } + self.header_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} + self.negative_patterns = re.compile( + r'nav|footer|header|sidebar|ads|comment|promo|advert|social|share', + re.I + ) + self.min_word_count = 2 + + @abstractmethod + def filter_content(self, html: str) -> List[str]: + """Abstract method to be implemented by specific filtering strategies""" + pass + + def 
extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str: + """Common method to extract page metadata with fallbacks""" + if self.user_query: + return self.user_query + + query_parts = [] + + # Title + if soup.title: + query_parts.append(soup.title.string) + elif soup.find('h1'): + query_parts.append(soup.find('h1').get_text()) + + # Meta tags + temp = "" + for meta_name in ['keywords', 'description']: + meta = soup.find('meta', attrs={'name': meta_name}) + if meta and meta.get('content'): + query_parts.append(meta['content']) + temp += meta['content'] + + # If still empty, grab first significant paragraph + if not temp: + # Find the first tag P thatits text contains more than 50 characters + for p in body.find_all('p'): + if len(p.get_text()) > 150: + query_parts.append(p.get_text()[:150]) + break + + return ' '.join(filter(None, query_parts)) + + + def extract_text_chunks(self, body: Tag) -> List[Tuple[str, str]]: + """ + Extracts text chunks from a BeautifulSoup body element while preserving order. + Returns list of tuples (text, tag_name) for classification. 
+ + Args: + body: BeautifulSoup Tag object representing the body element + + Returns: + List of (text, tag_name) tuples + """ + # Tags to ignore - inline elements that shouldn't break text flow + INLINE_TAGS = { + 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code', + 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q', + 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup', + 'textarea', 'time', 'tt', 'var' + } + + # Tags that typically contain meaningful headers + HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header'} + + chunks = [] + current_text = [] + chunk_index = 0 + + def should_break_chunk(tag: Tag) -> bool: + """Determine if a tag should cause a break in the current text chunk""" + return ( + tag.name not in INLINE_TAGS + and not (tag.name == 'p' and len(current_text) == 0) + ) + + # Use deque for efficient push/pop operations + stack = deque([(body, False)]) + + while stack: + element, visited = stack.pop() + + if visited: + # End of block element - flush accumulated text + if current_text and should_break_chunk(element): + text = ' '.join(''.join(current_text).split()) + if text: + tag_type = 'header' if element.name in HEADER_TAGS else 'content' + chunks.append((chunk_index, text, tag_type, element)) + chunk_index += 1 + current_text = [] + continue + + if isinstance(element, NavigableString): + if str(element).strip(): + current_text.append(str(element).strip()) + continue + + # Pre-allocate children to avoid multiple list operations + children = list(element.children) + if not children: + continue + + # Mark block for revisit after processing children + stack.append((element, True)) + + # Add children in reverse order for correct processing + for child in reversed(children): + if isinstance(child, (Tag, NavigableString)): + stack.append((child, False)) + + # Handle any remaining text + if current_text: + text = ' '.join(''.join(current_text).split()) + if text: + 
chunks.append((chunk_index, text, 'content', body)) + + return chunks + + + def extract_text_chunks1(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]: + """Common method for extracting text chunks""" + _text_cache = {} + def fast_text(element: Tag) -> str: + elem_id = id(element) + if elem_id in _text_cache: + return _text_cache[elem_id] + texts = [] + for content in element.contents: + if isinstance(content, str): + text = content.strip() + if text: + texts.append(text) + result = ' '.join(texts) + _text_cache[elem_id] = result + return result + + candidates = [] + index = 0 + + def dfs(element): + nonlocal index + if isinstance(element, Tag): + if element.name in self.included_tags: + if not self.is_excluded(element): + text = fast_text(element) + word_count = len(text.split()) + + # Headers pass through with adjusted minimum + if element.name in self.header_tags: + if word_count >= 3: # Minimal sanity check for headers + candidates.append((index, text, element)) + index += 1 + # Regular content uses standard minimum + elif word_count >= self.min_word_count: + candidates.append((index, text, element)) + index += 1 + + for child in element.children: + dfs(child) + + dfs(soup.body if soup.body else soup) + return candidates + + def is_excluded(self, tag: Tag) -> bool: + """Common method for exclusion logic""" + if tag.name in self.excluded_tags: + return True + class_id = ' '.join(filter(None, [ + ' '.join(tag.get('class', [])), + tag.get('id', '') + ])) + return bool(self.negative_patterns.search(class_id)) + + def clean_element(self, tag: Tag) -> str: + """Common method for cleaning HTML elements with minimal overhead""" + if not tag or not isinstance(tag, Tag): + return "" + + unwanted_tags = {'script', 'style', 'aside', 'form', 'iframe', 'noscript'} + unwanted_attrs = {'style', 'onclick', 'onmouseover', 'align', 'bgcolor', 'class', 'id'} + + # Use string builder pattern for better performance + builder = [] + + def render_tag(elem): + if not 
isinstance(elem, Tag): + if isinstance(elem, str): + builder.append(elem.strip()) + return + + if elem.name in unwanted_tags: + return + + # Start tag + builder.append(f'<{elem.name}') + + # Add cleaned attributes + attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs} + for key, value in attrs.items(): + builder.append(f' {key}="{value}"') + + builder.append('>') + + # Process children + for child in elem.children: + render_tag(child) + + # Close tag + builder.append(f'') + + try: + render_tag(tag) + return ''.join(builder) + except Exception: + return str(tag) # Fallback to original if anything fails + +class BM25ContentFilter(RelevantContentFilter): + def __init__(self, user_query: str = None, bm25_threshold: float = 1.0): + super().__init__(user_query=user_query) + self.bm25_threshold = bm25_threshold + self.priority_tags = { + 'h1': 5.0, + 'h2': 4.0, + 'h3': 3.0, + 'title': 4.0, + 'strong': 2.0, + 'b': 1.5, + 'em': 1.5, + 'blockquote': 2.0, + 'code': 2.0, + 'pre': 1.5, + 'th': 1.5, # Table headers + } + + def filter_content(self, html: str) -> List[str]: + """Implements content filtering using BM25 algorithm with priority tag handling""" + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, 'lxml') + body = soup.find('body') + query = self.extract_page_query(soup.find('head'), body) + candidates = self.extract_text_chunks(body) + + if not candidates: + return [] + + # Split into priority and regular candidates + priority_candidates = [] + regular_candidates = [] + + for index, chunk, tag_type, tag in candidates: + if tag.name in self.priority_tags: + priority_candidates.append((index, chunk, tag_type, tag)) + else: + regular_candidates.append((index, chunk, tag_type, tag)) + + # Process regular content with BM25 + tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates] + tokenized_query = query.lower().split() + + # Clean from stop words and noise + tokenized_corpus = 
[clean_tokens(tokens) for tokens in tokenized_corpus] + tokenized_query = clean_tokens(tokenized_query) + + bm25 = BM25Okapi(tokenized_corpus) + scores = bm25.get_scores(tokenized_query) + + # Score and boost regular candidates + scored_candidates = [ + (score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag) + for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates) + ] + scored_candidates.sort(key=lambda x: x[0], reverse=True) + + # Process scored candidates + selected_tags = set() + selected_candidates = [] + + # First add all priority candidates + for index, chunk, tag_type, tag in priority_candidates: + tag_id = id(tag) + if tag_id not in selected_tags: + selected_candidates.append((index, chunk, tag)) + selected_tags.add(tag_id) + + # Then add scored regular candidates that meet threshold + for score, index, chunk, tag_type, tag in scored_candidates: + if score < self.bm25_threshold: + continue + tag_id = id(tag) + if tag_id not in selected_tags: + selected_candidates.append((index, chunk, tag)) + selected_tags.add(tag_id) + + if not selected_candidates: + return [] + + # Sort by original document order + selected_candidates.sort(key=lambda x: x[0]) + return [self.clean_element(tag) for _, _, tag in selected_candidates] + diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index a2dbbd96..9c81638c 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -8,7 +8,8 @@ from .config import * from bs4 import element, NavigableString, Comment from urllib.parse import urljoin from requests.exceptions import InvalidSchema -from .content_cleaning_strategy import ContentCleaningStrategy +# from .content_cleaning_strategy import ContentCleaningStrategy +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter from .utils import ( sanitize_input_encode, @@ -532,8 +533,15 @@ class WebScrapingStrategy(ContentScrapingStrategy): 
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." if kwargs.get('fit_markdown', False): - cleaner = ContentCleaningStrategy() - fit_html = cleaner.clean(cleaned_html) + # cleaner = ContentCleaningStrategy() + # fit_html = cleaner.clean(cleaned_html) + # fit_markdown = h.handle(fit_html) + content_filter = BM25ContentFilter( + user_query= kwargs.get('fit_markdown_user_query', None), + bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + fit_html = content_filter.filter_content(html) + fit_html = '\n'.join('
    {}
    '.format(s) for s in fit_html) fit_markdown = h.handle(fit_html) cleaned_html = sanitize_html(cleaned_html) diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 4ac06797..cab4c45b 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,5 +1,7 @@ from pydantic import BaseModel, HttpUrl -from typing import List, Dict, Optional +from typing import List, Dict, Optional, Callable, Awaitable + + class UrlModel(BaseModel): url: HttpUrl @@ -12,6 +14,7 @@ class CrawlResult(BaseModel): cleaned_html: Optional[str] = None media: Dict[str, List[Dict]] = {} links: Dict[str, List[Dict]] = {} + downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None markdown: Optional[str] = None fit_markdown: Optional[str] = None @@ -21,4 +24,15 @@ class CrawlResult(BaseModel): error_message: Optional[str] = None session_id: Optional[str] = None response_headers: Optional[dict] = None - status_code: Optional[int] = None \ No newline at end of file + status_code: Optional[int] = None + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + status_code: int + screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + downloaded_files: Optional[List[str]] = None + + class Config: + arbitrary_types_allowed = True diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index d8bd6992..49483f43 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1054,3 +1054,58 @@ def is_external_url(url, base_domain): return False return False + +def clean_tokens(tokens: list[str]) -> list[str]: + # Set of tokens to remove + noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'} + + STOP_WORDS = { + 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', + 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', + 'to', 'was', 'were', 'will', 'with', + + # Pronouns + 'i', 'you', 'he', 'she', 'it', 'we', 'they', + 'me', 'him', 'her', 
'us', 'them', + 'my', 'your', 'his', 'her', 'its', 'our', 'their', + 'mine', 'yours', 'hers', 'ours', 'theirs', + 'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves', + + # Common verbs + 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', + 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', + + # Prepositions + 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', + 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', + 'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into', + 'near', 'of', 'off', 'on', 'out', 'outside', 'over', 'past', 'through', + 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', + + # Conjunctions + 'and', 'but', 'or', 'nor', 'for', 'yet', 'so', + 'although', 'because', 'since', 'unless', + + # Articles + 'a', 'an', 'the', + + # Other common words + 'this', 'that', 'these', 'those', + 'what', 'which', 'who', 'whom', 'whose', + 'when', 'where', 'why', 'how', + 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', + 'can', 'cannot', "can't", 'could', "couldn't", + 'may', 'might', 'must', "mustn't", + 'shall', 'should', "shouldn't", + 'will', "won't", 'would', "wouldn't", + 'not', "n't", 'no', 'nor', 'none' + } + + # Single comprehension, more efficient than multiple passes + return [token for token in tokens + if len(token) > 2 + and token not in noise + and token not in STOP_WORDS + and not token.startswith('↑') + and not token.startswith('▲') + and not token.startswith('⬆')] diff --git a/tests/async/test_async_doanloader.py b/tests/async/test_async_doanloader.py new file mode 100644 index 00000000..4798b4ca --- /dev/null +++ b/tests/async/test_async_doanloader.py @@ -0,0 +1,229 @@ +import os +import sys +import asyncio +import shutil +from typing import List +import tempfile +import time + +# Add the parent directory to the Python path +parent_dir = 
os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.async_webcrawler import AsyncWebCrawler + +class TestDownloads: + def __init__(self): + self.temp_dir = tempfile.mkdtemp(prefix="crawl4ai_test_") + self.download_dir = os.path.join(self.temp_dir, "downloads") + os.makedirs(self.download_dir, exist_ok=True) + self.results: List[str] = [] + + def cleanup(self): + shutil.rmtree(self.temp_dir) + + def log_result(self, test_name: str, success: bool, message: str = ""): + result = f"{'✅' if success else '❌'} {test_name}: {message}" + self.results.append(result) + print(result) + + async def test_basic_download(self): + """Test basic file download functionality""" + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + verbose=True + ) as crawler: + # Python.org downloads page typically has stable download links + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Click first download link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + "Basic Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result("Basic Download", False, str(e)) + + async def test_persistent_context_download(self): + """Test downloads with persistent context""" + try: + user_data_dir = os.path.join(self.temp_dir, "user_data") + os.makedirs(user_data_dir, exist_ok=True) + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + use_persistent_context=True, + user_data_dir=user_data_dir, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = 
document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + "Persistent Context Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result("Persistent Context Download", False, str(e)) + + async def test_multiple_downloads(self): + """Test multiple simultaneous downloads""" + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Click multiple download links + const downloadLinks = document.querySelectorAll('a[href$=".exe"]'); + downloadLinks.forEach(link => link.click()); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 1 + self.log_result( + "Multiple Downloads", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "Not enough files downloaded" + ) + except Exception as e: + self.log_result("Multiple Downloads", False, str(e)) + + async def test_different_browsers(self): + """Test downloads across different browser types""" + browsers = ["chromium", "firefox", "webkit"] + + for browser_type in browsers: + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + browser_type=browser_type, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + f"{browser_type.title()} Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if 
success else "No files downloaded" + ) + except Exception as e: + self.log_result(f"{browser_type.title()} Download", False, str(e)) + + async def test_edge_cases(self): + """Test various edge cases""" + + # Test 1: Downloads without specifying download path + try: + async with AsyncWebCrawler( + accept_downloads=True, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + self.log_result( + "Default Download Path", + True, + f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}" + ) + except Exception as e: + self.log_result("Default Download Path", False, str(e)) + + # Test 2: Downloads with invalid path + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path="/invalid/path/that/doesnt/exist", + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + self.log_result("Invalid Download Path", False, "Should have raised an error") + except Exception as e: + self.log_result("Invalid Download Path", True, "Correctly handled invalid path") + + # Test 3: Download with accept_downloads=False + try: + async with AsyncWebCrawler( + accept_downloads=False, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + success = result.downloaded_files is None + self.log_result( + "Disabled Downloads", + success, + "Correctly ignored downloads" if success else "Unexpectedly downloaded files" + ) + except Exception as e: + self.log_result("Disabled Downloads", False, str(e)) + + async def run_all_tests(self): + """Run all test cases""" + print("\n🧪 Running Download Tests...\n") + + test_methods = [ + self.test_basic_download, + self.test_persistent_context_download, + 
self.test_multiple_downloads, + self.test_different_browsers, + self.test_edge_cases + ] + + for test in test_methods: + print(f"\n📝 Running {test.__doc__}...") + await test() + await asyncio.sleep(2) # Brief pause between tests + + print("\n📊 Test Results Summary:") + for result in self.results: + print(result) + + successes = len([r for r in self.results if '✅' in r]) + total = len(self.results) + print(f"\nTotal: {successes}/{total} tests passed") + + self.cleanup() + +async def main(): + tester = TestDownloads() + await tester.run_all_tests() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From 7f1ae5adcf8552f9520d93eeec446c6ea7cd57e6 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 14 Nov 2024 22:51:51 +0800 Subject: [PATCH 07/50] Update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 33d09184..7a00aa2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +# [0.3.74] November 14, 2024 + +- In this commit, the library is updated to process file downloads. Users can now specify a download folder and trigger the download process via JavaScript or other means, with all files being saved. The list of downloaded files will also be added to the crowd result object. +- Another thing this commit introduces is the concept of the Relevance Content Filter. This is an improvement over Fit Markdown. This class of strategies aims to extract the main content from a given page - the part that really matters and is useful to be processed. One strategy has been created using the BM25 algorithm, which finds chunks of text from the web page relevant to its title, descriptions, and keywords, or supports a given user query and matches them. The result is then returned to the main engine to be converted to Markdown. Plans include adding approaches using language models as well. 
+- The cache database was updated to hold information about response headers and downloaded files. + + # Changelog - November 13, 2024 ### Added From 1f269f98344f08bc3390a4f9ec689787cdf5b59b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 15 Nov 2024 18:11:11 +0800 Subject: [PATCH 08/50] test(content_filter): add comprehensive tests for BM25ContentFilter functionality --- tests/async/test_content_filter.py | 175 +++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 tests/async/test_content_filter.py diff --git a/tests/async/test_content_filter.py b/tests/async/test_content_filter.py new file mode 100644 index 00000000..a873c414 --- /dev/null +++ b/tests/async/test_content_filter.py @@ -0,0 +1,175 @@ +import os, sys +import pytest +from bs4 import BeautifulSoup +from typing import List + +# Add the parent directory to the Python path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.content_filter_strategy import BM25ContentFilter + +@pytest.fixture +def basic_html(): + return """ + + + Test Article + + + + +

    Main Heading

    +
    +

    This is a long paragraph with more than fifty words. It continues with more text to ensure we meet the minimum word count threshold. We need to make sure this paragraph is substantial enough to be considered for extraction according to our filtering rules. This should be enough words now.

    + +
    + + + """ + +@pytest.fixture +def wiki_html(): + return """ + + + Wikipedia Article + + +

    Article Title

    +

    Section 1

    +

    Short but important section header description.

    +
    +

    Long paragraph with sufficient words to meet the minimum threshold. This paragraph continues with more text to ensure we have enough content for proper testing. We need to make sure this has enough words to pass our filters and be considered valid content for extraction purposes.

    +
    + + + """ + +@pytest.fixture +def no_meta_html(): + return """ + + +

    Simple Page

    +

    First paragraph that should be used as fallback for query when no meta tags exist. This text needs to be long enough to serve as a meaningful fallback for our content extraction process.

    + + + """ + +class TestBM25ContentFilter: + def test_basic_extraction(self, basic_html): + """Test basic content extraction functionality""" + filter = BM25ContentFilter() + contents = filter.filter_content(basic_html) + + assert contents, "Should extract content" + assert len(contents) >= 1, "Should extract at least one content block" + assert "long paragraph" in ' '.join(contents).lower() + assert "navigation" not in ' '.join(contents).lower() + + def test_user_query_override(self, basic_html): + """Test that user query overrides metadata extraction""" + user_query = "specific test query" + filter = BM25ContentFilter(user_query=user_query) + + # Access internal state to verify query usage + soup = BeautifulSoup(basic_html, 'lxml') + extracted_query = filter.extract_page_query(soup.find('head')) + + assert extracted_query == user_query + assert "Test description" not in extracted_query + + def test_header_extraction(self, wiki_html): + """Test that headers are properly extracted despite length""" + filter = BM25ContentFilter() + contents = filter.filter_content(wiki_html) + + combined_content = ' '.join(contents).lower() + assert "section 1" in combined_content, "Should include section header" + assert "article title" in combined_content, "Should include main title" + + def test_no_metadata_fallback(self, no_meta_html): + """Test fallback behavior when no metadata is present""" + filter = BM25ContentFilter() + contents = filter.filter_content(no_meta_html) + + assert contents, "Should extract content even without metadata" + assert "First paragraph" in ' '.join(contents), "Should use first paragraph content" + + def test_empty_input(self): + """Test handling of empty input""" + filter = BM25ContentFilter() + assert filter.filter_content("") == [] + assert filter.filter_content(None) == [] + + def test_malformed_html(self): + """Test handling of malformed HTML""" + malformed_html = "

    Unclosed paragraph

    Nested content

    " + filter = BM25ContentFilter() + contents = filter.filter_content(malformed_html) + + assert isinstance(contents, list), "Should return list even with malformed HTML" + + def test_threshold_behavior(self, basic_html): + """Test different BM25 threshold values""" + strict_filter = BM25ContentFilter(bm25_threshold=2.0) + lenient_filter = BM25ContentFilter(bm25_threshold=0.5) + + strict_contents = strict_filter.filter_content(basic_html) + lenient_contents = lenient_filter.filter_content(basic_html) + + assert len(strict_contents) <= len(lenient_contents), \ + "Strict threshold should extract fewer elements" + + def test_html_cleaning(self, basic_html): + """Test HTML cleaning functionality""" + filter = BM25ContentFilter() + contents = filter.filter_content(basic_html) + + cleaned_content = ' '.join(contents) + assert 'class=' not in cleaned_content, "Should remove class attributes" + assert 'style=' not in cleaned_content, "Should remove style attributes" + assert ' +
    {'

    Test content. ' * 1000}

    + + """ + filter = BM25ContentFilter() + contents = filter.filter_content(large_html) + assert contents, "Should handle large content blocks" + + @pytest.mark.parametrize("unwanted_tag", [ + 'script', 'style', 'nav', 'footer', 'header' + ]) + def test_excluded_tags(self, unwanted_tag): + """Test that specific tags are properly excluded""" + html = f""" + + <{unwanted_tag}>Should not appear +

    Should appear

    + + """ + filter = BM25ContentFilter() + contents = filter.filter_content(html) + + combined_content = ' '.join(contents).lower() + assert "should not appear" not in combined_content + + def test_performance(self, basic_html): + """Test performance with timer""" + filter = BM25ContentFilter() + + import time + start = time.perf_counter() + filter.filter_content(basic_html) + duration = time.perf_counter() - start + + assert duration < 1.0, f"Processing took too long: {duration:.2f} seconds" + +if __name__ == "__main__": + pytest.main([__file__]) \ No newline at end of file From ae7ebc0bd82e6d621f0d13a8a22d537f31dff0f6 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 15 Nov 2024 20:16:13 +0800 Subject: [PATCH 09/50] chore: update .gitignore and enhance changelog with major feature additions and examples --- .gitignore | 3 +- CHANGELOG.md | 30 +++++ docs/examples/v0.3.74.overview.py | 195 ++++++++++++++++++++++++++++++ 3 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 docs/examples/v0.3.74.overview.py diff --git a/.gitignore b/.gitignore index aca02959..0acec10f 100644 --- a/.gitignore +++ b/.gitignore @@ -209,4 +209,5 @@ git_issues.md .tests/ .issues/ .docs/ -.issues/ \ No newline at end of file +.issues/ +.gitboss/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a00aa2e..e82fa6a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,35 @@ # Changelog + +## Version 0.3.74, Major Changes + +1. **File Download Processing** (Nov 14, 2024) + - Added capability for users to specify download folders + - Implemented file download tracking in crowd result object + - Created new file: `tests/async/test_async_doanloader.py` + +2. 
**Content Filtering Improvements** (Nov 14, 2024) + - Introduced Relevance Content Filter as an improvement over Fit Markdown + - Implemented BM25 algorithm for content relevance matching + - Added new file: `crawl4ai/content_filter_strategy.py` + - Removed deprecated: `crawl4ai/content_cleaning_strategy.py` + +3. **Local File and Raw HTML Support** (Nov 13, 2024) + - Added support for processing local files + - Implemented raw HTML input handling in AsyncWebCrawler + - Enhanced `crawl4ai/async_webcrawler.py` with significant performance improvements + +4. **Browser Management Enhancements** (Nov 12, 2024) + - Implemented new async crawler strategy using Playwright + - Introduced ManagedBrowser for better browser session handling + - Added support for persistent browser sessions + - Updated from playwright_stealth to tf-playwright-stealth + +5. **API Server Component** + - Added CORS support + - Implemented static file serving + - Enhanced root redirect functionality + # [0.3.74] November 14, 2024 - In this commit, the library is updated to process file downloads. Users can now specify a download folder and trigger the download process via JavaScript or other means, with all files being saved. The list of downloaded files will also be added to the crowd result object. diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py new file mode 100644 index 00000000..579d05dd --- /dev/null +++ b/docs/examples/v0.3.74.overview.py @@ -0,0 +1,195 @@ +import asyncio +import os +from pathlib import Path +import aiohttp +import json +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +# 1. 
File Download Processing Example +async def download_example(): + """Example of downloading files from Python.org""" + # downloads_path = os.path.join(os.getcwd(), "downloads") + downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads") + os.makedirs(downloads_path, exist_ok=True) + + print(f"Downloads will be saved to: {downloads_path}") + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=downloads_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Find and click the first Windows installer link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { + console.log('Found download link:', downloadLink.href); + downloadLink.click(); + } else { + console.log('No .exe download link found'); + } + """, + wait_for=5 # Wait 5 seconds to ensure download starts + ) + + if result.downloaded_files: + print("\nDownload successful!") + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path}") + print(f" File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB") + else: + print("\nNo files were downloaded") + +# 2. Content Filtering with BM25 Example +async def content_filtering_example(): + """Example of using the new BM25 content filtering""" + async with AsyncWebCrawler(verbose=True) as crawler: + # Create filter with custom query for OpenAI's blog + content_filter = BM25ContentFilter( + user_query="AI language models research innovation", + bm25_threshold=1.0 + ) + + result = await crawler.arun( + url="https://openai.com/blog", + extraction_strategy=content_filter + ) + + print(f"Filtered content: {result.extracted_content}") + +# 3. Local File and Raw HTML Processing Example +async def local_and_raw_html_example(): + """Example of processing local files and raw HTML""" + # Create a sample HTML file + sample_file = "sample.html" + with open(sample_file, "w") as f: + f.write(""" + +

    Test Content

    +

    This is a test paragraph.

    + + """) + + async with AsyncWebCrawler(verbose=True) as crawler: + # Process local file + local_result = await crawler.arun( + url=f"file://{os.path.abspath(sample_file)}" + ) + + # Process raw HTML + raw_html = """ + +

    Raw HTML Test

    +

    This is a test of raw HTML processing.

    + + """ + raw_result = await crawler.arun( + url=f"raw:{raw_html}" + ) + + # Clean up + os.remove(sample_file) + + print("Local file content:", local_result.markdown) + print("\nRaw HTML content:", raw_result.markdown) + +# 4. Browser Management Example +async def browser_management_example(): + """Example of using enhanced browser management features""" + # Use the specified user directory path + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile") + os.makedirs(user_data_dir, exist_ok=True) + + print(f"Browser profile will be saved to: {user_data_dir}") + + async with AsyncWebCrawler( + use_managed_browser=True, + user_data_dir=user_data_dir, + headless=False, + verbose=True + ) as crawler: + # Use GitHub as an example - it's a good test for browser management + # because it requires proper browser handling + result = await crawler.arun( + url="https://github.com/trending", + session_id="persistent_session_1", + js_code=""" + // Custom JavaScript to execute on GitHub's trending page + const repos = document.querySelectorAll('article.Box-row'); + const data = Array.from(repos).map(repo => ({ + name: repo.querySelector('h2')?.textContent?.trim(), + description: repo.querySelector('p')?.textContent?.trim(), + language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim() + })); + console.log('Trending repositories:', JSON.stringify(data, null, 2)); + """ + ) + + print("\nBrowser session result:", result.success) + if result.success: + print("Page title:", result.metadata.get('title', 'No title found')) + +# 5. 
API Usage Example +async def api_example(): + """Example of using the new API endpoints""" + async with aiohttp.ClientSession() as session: + # Submit crawl job + crawl_request = { + "urls": ["https://news.ycombinator.com"], # Hacker News as an example + "extraction_config": { + "type": "json_css", + "params": { + "selectors": { + "titles": ".title a", + "scores": ".score", + "comments": ".comment-tree" + } + } + }, + "crawler_params": { + "headless": True, + "use_managed_browser": True + }, + "screenshot": True, + "magic": True + } + + async with session.post( + "http://localhost:11235/crawl", + json=crawl_request + ) as response: + task_data = await response.json() + task_id = task_data["task_id"] + + # Check task status + async with session.get( + f"http://localhost:11235/task/{task_id}" + ) as status_response: + result = await status_response.json() + print(f"Task result: {result}") + +# Main execution +async def main(): + print("Running Crawl4AI feature examples...") + + print("\n1. Running Download Example:") + await download_example() + + print("\n2. Running Content Filtering Example:") + await content_filtering_example() + + print("\n3. Running Local and Raw HTML Example:") + await local_and_raw_html_example() + + print("\n4. Running Browser Management Example:") + await browser_management_example() + + print("\n5. Running API Example:") + await api_example() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From d0014c67931a27f3969e257da59aa9b70527b4cf Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 14:54:41 +0800 Subject: [PATCH 10/50] New async database manager and migration support - Introduced AsyncDatabaseManager for async DB management. - Added migration feature to transition to file-based storage. - Enhanced web crawler with improved caching logic. - Updated requirements and setup for async processing. 
--- crawl4ai/async_database.3.73.py | 285 ++++++++++++++++++++++++++++++++ crawl4ai/async_database.py | 185 +++++++++++++++------ crawl4ai/async_webcrawler.py | 116 +++++++------ crawl4ai/config.py | 4 +- crawl4ai/migrations.py | 152 +++++++++++++++++ crawl4ai/utils.py | 27 +++ requirements.txt | 1 + setup.py | 34 +++- 8 files changed, 685 insertions(+), 119 deletions(-) create mode 100644 crawl4ai/async_database.3.73.py create mode 100644 crawl4ai/migrations.py diff --git a/crawl4ai/async_database.3.73.py b/crawl4ai/async_database.3.73.py new file mode 100644 index 00000000..f86c7f1d --- /dev/null +++ b/crawl4ai/async_database.3.73.py @@ -0,0 +1,285 @@ +import os +from pathlib import Path +import aiosqlite +import asyncio +from typing import Optional, Tuple, Dict +from contextlib import asynccontextmanager +import logging +import json # Added for serialization/deserialization +from .utils import ensure_content_dirs, generate_content_hash +import xxhash +import aiofiles +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +DB_PATH = os.path.join(Path.home(), ".crawl4ai") +os.makedirs(DB_PATH, exist_ok=True) +DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") + +class AsyncDatabaseManager: + def __init__(self, pool_size: int = 10, max_retries: int = 3): + self.db_path = DB_PATH + self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH)) + self.pool_size = pool_size + self.max_retries = max_retries + self.connection_pool: Dict[int, aiosqlite.Connection] = {} + self.pool_lock = asyncio.Lock() + self.connection_semaphore = asyncio.Semaphore(pool_size) + + async def initialize(self): + """Initialize the database and connection pool""" + await self.ainit_db() + + async def cleanup(self): + """Cleanup connections when shutting down""" + async with self.pool_lock: + for conn in self.connection_pool.values(): + await conn.close() + self.connection_pool.clear() + + @asynccontextmanager + async def get_connection(self): + 
"""Connection pool manager""" + async with self.connection_semaphore: + task_id = id(asyncio.current_task()) + try: + async with self.pool_lock: + if task_id not in self.connection_pool: + conn = await aiosqlite.connect( + self.db_path, + timeout=30.0 + ) + await conn.execute('PRAGMA journal_mode = WAL') + await conn.execute('PRAGMA busy_timeout = 5000') + self.connection_pool[task_id] = conn + + yield self.connection_pool[task_id] + + except Exception as e: + logger.error(f"Connection error: {e}") + raise + finally: + async with self.pool_lock: + if task_id in self.connection_pool: + await self.connection_pool[task_id].close() + del self.connection_pool[task_id] + + async def execute_with_retry(self, operation, *args): + """Execute database operations with retry logic""" + for attempt in range(self.max_retries): + try: + async with self.get_connection() as db: + result = await operation(db, *args) + await db.commit() + return result + except Exception as e: + if attempt == self.max_retries - 1: + logger.error(f"Operation failed after {self.max_retries} attempts: {e}") + raise + await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff + + async def ainit_db(self): + """Initialize database schema""" + async def _init(db): + await db.execute(''' + CREATE TABLE IF NOT EXISTS crawled_data ( + url TEXT PRIMARY KEY, + html TEXT, + cleaned_html TEXT, + markdown TEXT, + extracted_content TEXT, + success BOOLEAN, + media TEXT DEFAULT "{}", + links TEXT DEFAULT "{}", + metadata TEXT DEFAULT "{}", + screenshot TEXT DEFAULT "", + response_headers TEXT DEFAULT "{}", + downloaded_files TEXT DEFAULT "{}" -- New column added + ) + ''') + + await self.execute_with_retry(_init) + await self.update_db_schema() + + async def update_db_schema(self): + """Update database schema if needed""" + async def _check_columns(db): + cursor = await db.execute("PRAGMA table_info(crawled_data)") + columns = await cursor.fetchall() + return [column[1] for column in columns] + + column_names = 
await self.execute_with_retry(_check_columns) + + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] + + for column in new_columns: + if column not in column_names: + await self.aalter_db_add_column(column) + + async def aalter_db_add_column(self, new_column: str): + """Add new column to the database""" + async def _alter(db): + if new_column == 'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + logger.info(f"Added column '{new_column}' to the database.") + + await self.execute_with_retry(_alter) + + + async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]: + """Retrieve cached URL data""" + async def _get(db): + async with db.execute( + ''' + SELECT url, html, cleaned_html, markdown, + extracted_content, success, media, links, + metadata, screenshot, response_headers, + downloaded_files + FROM crawled_data WHERE url = ? 
+ ''', + (url,) + ) as cursor: + row = await cursor.fetchone() + if row: + # Load content from files using stored hashes + html = await self._load_content(row[1], 'html') if row[1] else "" + cleaned = await self._load_content(row[2], 'cleaned') if row[2] else "" + markdown = await self._load_content(row[3], 'markdown') if row[3] else "" + extracted = await self._load_content(row[4], 'extracted') if row[4] else "" + screenshot = await self._load_content(row[9], 'screenshots') if row[9] else "" + + return ( + row[0], # url + html or "", # Return empty string if file not found + cleaned or "", + markdown or "", + extracted or "", + row[5], # success + json.loads(row[6] or '{}'), # media + json.loads(row[7] or '{}'), # links + json.loads(row[8] or '{}'), # metadata + screenshot or "", + json.loads(row[10] or '{}'), # response_headers + json.loads(row[11] or '[]') # downloaded_files + ) + return None + + try: + return await self.execute_with_retry(_get) + except Exception as e: + logger.error(f"Error retrieving cached URL: {e}") + return None + + async def acache_url(self, url: str, html: str, cleaned_html: str, + markdown: str, extracted_content: str, success: bool, + media: str = "{}", links: str = "{}", + metadata: str = "{}", screenshot: str = "", + response_headers: str = "{}", downloaded_files: str = "[]"): + """Cache URL data with content stored in filesystem""" + + # Store content files and get hashes + html_hash = await self._store_content(html, 'html') + cleaned_hash = await self._store_content(cleaned_html, 'cleaned') + markdown_hash = await self._store_content(markdown, 'markdown') + extracted_hash = await self._store_content(extracted_content, 'extracted') + screenshot_hash = await self._store_content(screenshot, 'screenshots') + + async def _cache(db): + await db.execute(''' + INSERT INTO crawled_data ( + url, html, cleaned_html, markdown, + extracted_content, success, media, links, metadata, + screenshot, response_headers, downloaded_files + ) + VALUES 
(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ON CONFLICT(url) DO UPDATE SET + html = excluded.html, + cleaned_html = excluded.cleaned_html, + markdown = excluded.markdown, + extracted_content = excluded.extracted_content, + success = excluded.success, + media = excluded.media, + links = excluded.links, + metadata = excluded.metadata, + screenshot = excluded.screenshot, + response_headers = excluded.response_headers, + downloaded_files = excluded.downloaded_files + ''', (url, html_hash, cleaned_hash, markdown_hash, extracted_hash, + success, media, links, metadata, screenshot_hash, + response_headers, downloaded_files)) + + try: + await self.execute_with_retry(_cache) + except Exception as e: + logger.error(f"Error caching URL: {e}") + + + + async def aget_total_count(self) -> int: + """Get total number of cached URLs""" + async def _count(db): + async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor: + result = await cursor.fetchone() + return result[0] if result else 0 + + try: + return await self.execute_with_retry(_count) + except Exception as e: + logger.error(f"Error getting total count: {e}") + return 0 + + async def aclear_db(self): + """Clear all data from the database""" + async def _clear(db): + await db.execute('DELETE FROM crawled_data') + + try: + await self.execute_with_retry(_clear) + except Exception as e: + logger.error(f"Error clearing database: {e}") + + async def aflush_db(self): + """Drop the entire table""" + async def _flush(db): + await db.execute('DROP TABLE IF EXISTS crawled_data') + + try: + await self.execute_with_retry(_flush) + except Exception as e: + logger.error(f"Error flushing database: {e}") + + + async def _store_content(self, content: str, content_type: str) -> str: + """Store content in filesystem and return hash""" + if not content: + return "" + + content_hash = generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + # Only write if file doesn't exist + if not 
os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]: + """Load content from filesystem by hash""" + if not content_hash: + return None + + file_path = os.path.join(self.content_paths[content_type], content_hash) + try: + async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: + return await f.read() + except: + logger.error(f"Failed to load content: {file_path}") + return None + +# Create a singleton instance +async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index c52e3db6..f97d8131 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -6,7 +6,11 @@ from typing import Optional, Tuple, Dict from contextlib import asynccontextmanager import logging import json # Added for serialization/deserialization - +from .utils import ensure_content_dirs, generate_content_hash +from .models import CrawlResult +import xxhash +import aiofiles +from .config import NEED_MIGRATION # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -18,6 +22,7 @@ DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") class AsyncDatabaseManager: def __init__(self, pool_size: int = 10, max_retries: int = 3): self.db_path = DB_PATH + self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH)) self.pool_size = pool_size self.max_retries = max_retries self.connection_pool: Dict[int, aiosqlite.Connection] = {} @@ -26,8 +31,20 @@ class AsyncDatabaseManager: async def initialize(self): """Initialize the database and connection pool""" - await self.ainit_db() - + try: + logger.info("Initializing database...") + await self.ainit_db() + if NEED_MIGRATION: + await self.update_db_schema() + from .migrations import run_migration # Import here to avoid circular imports + await run_migration() + 
logger.info("Database initialization and migration completed successfully") + else: + logger.info("Database initialization completed successfully") + except Exception as e: + logger.error(f"Database initialization error: {e}") + logger.info("Database will be initialized on first use") + async def cleanup(self): """Cleanup connections when shutting down""" async with self.pool_lock: @@ -97,7 +114,7 @@ class AsyncDatabaseManager: ''') await self.execute_with_retry(_init) - await self.update_db_schema() + async def update_db_schema(self): """Update database schema if needed""" @@ -126,34 +143,59 @@ class AsyncDatabaseManager: await self.execute_with_retry(_alter) - async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]: - """Retrieve cached URL data""" + async def aget_cached_url(self, url: str) -> Optional[CrawlResult]: + """Retrieve cached URL data as CrawlResult""" async def _get(db): async with db.execute( - ''' - SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files - FROM crawled_data WHERE url = ? 
- ''', - (url,) + 'SELECT * FROM crawled_data WHERE url = ?', (url,) ) as cursor: row = await cursor.fetchone() - if row: - # Deserialize JSON fields - return ( - row[0], # url - row[1], # html - row[2], # cleaned_html - row[3], # markdown - row[4], # extracted_content - row[5], # success - json.loads(row[6] or '{}'), # media - json.loads(row[7] or '{}'), # links - json.loads(row[8] or '{}'), # metadata - row[9], # screenshot - json.loads(row[10] or '{}'), # response_headers - json.loads(row[11] or '[]') # downloaded_files - ) - return None + if not row: + return None + + # Get column names + columns = [description[0] for description in cursor.description] + # Create dict from row data + row_dict = dict(zip(columns, row)) + + # Load content from files using stored hashes + content_fields = { + 'html': row_dict['html'], + 'cleaned_html': row_dict['cleaned_html'], + 'markdown': row_dict['markdown'], + 'extracted_content': row_dict['extracted_content'], + 'screenshot': row_dict['screenshot'] + } + + for field, hash_value in content_fields.items(): + if hash_value: + content = await self._load_content( + hash_value, + field.split('_')[0] # Get content type from field name + ) + row_dict[field] = content or "" + else: + row_dict[field] = "" + + # Parse JSON fields + json_fields = ['media', 'links', 'metadata', 'response_headers'] + for field in json_fields: + try: + row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {} + except json.JSONDecodeError: + row_dict[field] = {} + + # Parse downloaded_files + try: + row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else [] + except json.JSONDecodeError: + row_dict['downloaded_files'] = [] + + # Remove any fields not in CrawlResult model + valid_fields = CrawlResult.__annotations__.keys() + filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields} + + return CrawlResult(**filtered_dict) try: return await self.execute_with_retry(_get) @@ 
-161,26 +203,27 @@ class AsyncDatabaseManager: logger.error(f"Error retrieving cached URL: {e}") return None - async def acache_url( - self, - url: str, - html: str, - cleaned_html: str, - markdown: str, - extracted_content: str, - success: bool, - media: str = "{}", - links: str = "{}", - metadata: str = "{}", - screenshot: str = "", - response_headers: str = "{}", - downloaded_files: str = "[]" - ): - """Cache URL data with retry logic""" + async def acache_url(self, result: CrawlResult): + """Cache CrawlResult data""" + # Store content files and get hashes + content_map = { + 'html': (result.html, 'html'), + 'cleaned_html': (result.cleaned_html or "", 'cleaned'), + 'markdown': (result.markdown or "", 'markdown'), + 'extracted_content': (result.extracted_content or "", 'extracted'), + 'screenshot': (result.screenshot or "", 'screenshots') + } + + content_hashes = {} + for field, (content, content_type) in content_map.items(): + content_hashes[field] = await self._store_content(content, content_type) + async def _cache(db): await db.execute(''' INSERT INTO crawled_data ( - url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files + url, html, cleaned_html, markdown, + extracted_content, success, media, links, metadata, + screenshot, response_headers, downloaded_files ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
ON CONFLICT(url) DO UPDATE SET @@ -189,13 +232,26 @@ class AsyncDatabaseManager: markdown = excluded.markdown, extracted_content = excluded.extracted_content, success = excluded.success, - media = excluded.media, - links = excluded.links, - metadata = excluded.metadata, + media = excluded.media, + links = excluded.links, + metadata = excluded.metadata, screenshot = excluded.screenshot, - response_headers = excluded.response_headers, -- Update response_headers + response_headers = excluded.response_headers, downloaded_files = excluded.downloaded_files - ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files)) + ''', ( + result.url, + content_hashes['html'], + content_hashes['cleaned_html'], + content_hashes['markdown'], + content_hashes['extracted_content'], + result.success, + json.dumps(result.media), + json.dumps(result.links), + json.dumps(result.metadata or {}), + content_hashes['screenshot'], + json.dumps(result.response_headers or {}), + json.dumps(result.downloaded_files or []) + )) try: await self.execute_with_retry(_cache) @@ -234,6 +290,35 @@ class AsyncDatabaseManager: await self.execute_with_retry(_flush) except Exception as e: logger.error(f"Error flushing database: {e}") + + + async def _store_content(self, content: str, content_type: str) -> str: + """Store content in filesystem and return hash""" + if not content: + return "" + + content_hash = generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + # Only write if file doesn't exist + if not os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]: + """Load content from filesystem by hash""" + if not content_hash: + return None + + file_path = os.path.join(self.content_paths[content_type], 
content_hash) + try: + async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: + return await f.read() + except: + logger.error(f"Failed to load content: {file_path}") + return None # Create a singleton instance async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index cec1ace0..febc01d3 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -47,17 +47,17 @@ class AsyncWebCrawler: async def awarmup(self): # Print a message for crawl4ai and its version - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") if self.verbose: + print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") print("[LOG] 🌤️ Warming up the AsyncWebCrawler") # await async_db_manager.ainit_db() - await async_db_manager.initialize() - await self.arun( - url="https://google.com/", - word_count_threshold=5, - bypass_cache=False, - verbose=False, - ) + # # await async_db_manager.initialize() + # await self.arun( + # url="https://google.com/", + # word_count_threshold=5, + # bypass_cache=False, + # verbose=False, + # ) self.ready = True if self.verbose: print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") @@ -73,6 +73,9 @@ class AsyncWebCrawler: screenshot: bool = False, user_agent: str = None, verbose=True, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, **kwargs, ) -> CrawlResult: """ @@ -89,6 +92,11 @@ class AsyncWebCrawler: CrawlResult: The result of the crawling and processing. 
""" try: + if disable_cache: + bypass_cache = True + no_cache_read = True + no_cache_write = True + extraction_strategy = extraction_strategy or NoExtractionStrategy() extraction_strategy.verbose = verbose if not isinstance(extraction_strategy, ExtractionStrategy): @@ -108,36 +116,39 @@ class AsyncWebCrawler: is_raw_html = url.startswith("raw:") _url = url if not is_raw_html else "Raw HTML" - if is_web_url and not bypass_cache and not self.always_by_pass_cache: - cached = await async_db_manager.aget_cached_url(url) + start_time = time.perf_counter() + cached_result = None + if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache: + cached_result = await async_db_manager.aget_cached_url(url) - # if not bypass_cache and not self.always_by_pass_cache: - # cached = await async_db_manager.aget_cached_url(url) - - if kwargs.get("warmup", True) and not self.ready: - return None - - if cached: - html = sanitize_input_encode(cached[1]) - extracted_content = sanitize_input_encode(cached[4]) + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode(cached_result.extracted_content or "") if screenshot: - screenshot_data = cached[9] + screenshot_data = cached_result.screenshot if not screenshot_data: - cached = None + cached_result = None + if verbose: + print( + f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds" + ) + if not cached or not html: - t1 = time.time() + t1 = time.perf_counter() + if user_agent: self.crawler_strategy.update_user_agent(user_agent) async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot - t2 = time.time() + t2 = time.perf_counter() if verbose: print( - f"[LOG] 🚀 Crawling done for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} 
seconds" + f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" ) + t1 = time.perf_counter() crawl_result = await self.aprocess_html( url=url, html=html, @@ -163,30 +174,19 @@ class AsyncWebCrawler: crawl_result.downloaded_files = async_response.downloaded_files else: crawl_result.status_code = 200 - crawl_result.response_headers = cached[10] - # crawl_result.downloaded_files = cached[11] + crawl_result.response_headers = cached_result.response_headers if cached_result else {} crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) + if verbose: + print( + f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds" + ) - if not is_raw_html: - if not bool(cached) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url( - url = url, - html = html, - cleaned_html = crawl_result.cleaned_html, - markdown = crawl_result.markdown, - extracted_content = extracted_content, - success = True, - media = json.dumps(crawl_result.media), - links = json.dumps(crawl_result.links), - metadata = json.dumps(crawl_result.metadata), - screenshot=screenshot, - response_headers=json.dumps(crawl_result.response_headers), - downloaded_files=json.dumps(crawl_result.downloaded_files), - - ) + if not is_raw_html and not no_cache_write: + if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: + await async_db_manager.acache_url(crawl_result) return crawl_result @@ -258,11 +258,11 @@ class AsyncWebCrawler: verbose: bool, **kwargs, ) -> CrawlResult: - t = time.time() + t = time.perf_counter() # Extract content from HTML try: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" - t1 = time.time() + t1 = time.perf_counter() scrapping_strategy = WebScrapingStrategy() # result = await scrapping_strategy.ascrap( result = 
scrapping_strategy.scrap( @@ -276,10 +276,6 @@ class AsyncWebCrawler: ), **kwargs, ) - if verbose: - print( - f"[LOG] 🚀 Content extracted for {_url}, success: True, time taken: {time.time() - t1:.2f} seconds" - ) if result is None: raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") @@ -295,13 +291,14 @@ class AsyncWebCrawler: media = result.get("media", []) links = result.get("links", []) metadata = result.get("metadata", {}) + + if verbose: + print( + f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds" + ) - if extracted_content is None and extraction_strategy and chunking_strategy: - if verbose: - print( - f"[LOG] 🔥 Extracting semantic blocks for {_url}, Strategy: {self.__class__.__name__}" - ) - + if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): + t1 = time.perf_counter() # Check if extraction strategy is type of JsonCssExtractionStrategy if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): extraction_strategy.verbose = verbose @@ -311,11 +308,10 @@ class AsyncWebCrawler: sections = chunking_strategy.chunk(markdown) extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - - if verbose: - print( - f"[LOG] 🚀 Extraction done for {_url}, time taken: {time.time() - t:.2f} seconds." 
- ) + if verbose: + print( + f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds" + ) screenshot = None if not screenshot else screenshot diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 16638b6d..5bc284bf 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -52,4 +52,6 @@ SOCIAL_MEDIA_DOMAINS = [ # If image is in the first half of the total images extracted from the page IMAGE_SCORE_THRESHOLD = 2 -MAX_METRICS_HISTORY = 1000 \ No newline at end of file +MAX_METRICS_HISTORY = 1000 + +NEED_MIGRATION = True \ No newline at end of file diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py new file mode 100644 index 00000000..77616086 --- /dev/null +++ b/crawl4ai/migrations.py @@ -0,0 +1,152 @@ +import os +import asyncio +import logging +from pathlib import Path +import aiosqlite +from typing import Optional +import xxhash +import aiofiles +import shutil +import time +from datetime import datetime + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class DatabaseMigration: + def __init__(self, db_path: str): + self.db_path = db_path + self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path)) + + def _ensure_content_dirs(self, base_path: str) -> dict: + dirs = { + 'html': 'html_content', + 'cleaned': 'cleaned_html', + 'markdown': 'markdown_content', + 'extracted': 'extracted_content', + 'screenshots': 'screenshots' + } + content_paths = {} + for key, dirname in dirs.items(): + path = os.path.join(base_path, dirname) + os.makedirs(path, exist_ok=True) + content_paths[key] = path + return content_paths + + def _generate_content_hash(self, content: str) -> str: + x = xxhash.xxh64() + x.update(content.encode()) + content_hash = x.hexdigest() + return content_hash + # return hashlib.sha256(content.encode()).hexdigest() + + async def _store_content(self, content: str, content_type: str) -> str: + if not content: + return "" + + content_hash = 
self._generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + if not os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def migrate_database(self): + """Migrate existing database to file-based storage""" + logger.info("Starting database migration...") + + try: + async with aiosqlite.connect(self.db_path) as db: + # Get all rows + async with db.execute( + '''SELECT url, html, cleaned_html, markdown, + extracted_content, screenshot FROM crawled_data''' + ) as cursor: + rows = await cursor.fetchall() + + migrated_count = 0 + for row in rows: + url, html, cleaned_html, markdown, extracted_content, screenshot = row + + # Store content in files and get hashes + html_hash = await self._store_content(html, 'html') + cleaned_hash = await self._store_content(cleaned_html, 'cleaned') + markdown_hash = await self._store_content(markdown, 'markdown') + extracted_hash = await self._store_content(extracted_content, 'extracted') + screenshot_hash = await self._store_content(screenshot, 'screenshots') + + # Update database with hashes + await db.execute(''' + UPDATE crawled_data + SET html = ?, + cleaned_html = ?, + markdown = ?, + extracted_content = ?, + screenshot = ? + WHERE url = ? + ''', (html_hash, cleaned_hash, markdown_hash, + extracted_hash, screenshot_hash, url)) + + migrated_count += 1 + if migrated_count % 100 == 0: + logger.info(f"Migrated {migrated_count} records...") + + await db.commit() + logger.info(f"Migration completed. {migrated_count} records processed.") + + except Exception as e: + logger.error(f"Migration failed: {e}") + raise + +async def backup_database(db_path: str) -> str: + """Create backup of existing database""" + if not os.path.exists(db_path): + logger.info("No existing database found. 
Skipping backup.") + return None + + # Create backup with timestamp + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + backup_path = f"{db_path}.backup_{timestamp}" + + try: + # Wait for any potential write operations to finish + await asyncio.sleep(1) + + # Create backup + shutil.copy2(db_path, backup_path) + logger.info(f"Database backup created at: {backup_path}") + return backup_path + except Exception as e: + logger.error(f"Backup failed: {e}") + raise + +async def run_migration(db_path: Optional[str] = None): + """Run database migration""" + if db_path is None: + db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") + + if not os.path.exists(db_path): + logger.info("No existing database found. Skipping migration.") + return + + # Create backup first + backup_path = await backup_database(db_path) + if not backup_path: + return + + migration = DatabaseMigration(db_path) + await migration.migrate_database() + +def main(): + """CLI entry point for migration""" + import argparse + parser = argparse.ArgumentParser(description='Migrate Crawl4AI database to file-based storage') + parser.add_argument('--db-path', help='Custom database path') + args = parser.parse_args() + + asyncio.run(run_migration(args.db_path)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 49483f43..a80cf09a 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -14,6 +14,9 @@ from typing import Dict, Any from urllib.parse import urljoin import requests from requests.exceptions import InvalidSchema +import hashlib +from typing import Optional, Tuple, Dict, Any +import xxhash class InvalidCSSSelectorError(Exception): pass @@ -1109,3 +1112,27 @@ def clean_tokens(tokens: list[str]) -> list[str]: and not token.startswith('↑') and not token.startswith('▲') and not token.startswith('⬆')] + + +def generate_content_hash(content: str) -> str: + """Generate a unique hash for content""" + return 
xxhash.xxh64(content.encode()).hexdigest() + # return hashlib.sha256(content.encode()).hexdigest() + +def ensure_content_dirs(base_path: str) -> Dict[str, str]: + """Create content directories if they don't exist""" + dirs = { + 'html': 'html_content', + 'cleaned': 'cleaned_html', + 'markdown': 'markdown_content', + 'extracted': 'extracted_content', + 'screenshots': 'screenshots' + } + + content_paths = {} + for key, dirname in dirs.items(): + path = os.path.join(base_path, dirname) + os.makedirs(path, exist_ok=True) + content_paths[key] = path + + return content_paths \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e83643b3..94f741ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 tf-playwright-stealth~=1.0 +xxhash~=3.4 diff --git a/setup.py b/setup.py index 93190291..d3145ac1 100644 --- a/setup.py +++ b/setup.py @@ -5,34 +5,37 @@ from pathlib import Path import shutil import subprocess import sys +import asyncio -# Create the .crawl4ai folder in the user's home directory if it doesn't exist -# If the folder already exists, remove the cache folder +# Create the .crawl4ai folder structure crawl4ai_folder = Path.home() / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" +content_folders = ['html_content', 'cleaned_html', 'markdown_content', + 'extracted_content', 'screenshots'] +# Clean up old cache if exists if cache_folder.exists(): shutil.rmtree(cache_folder) +# Create new folder structure crawl4ai_folder.mkdir(exist_ok=True) cache_folder.mkdir(exist_ok=True) +for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) -# Read the requirements from requirements.txt +# Read requirements and version __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) with open(os.path.join(__location__, "requirements.txt")) as f: requirements = f.read().splitlines() -# Read version from __init__.py with 
open("crawl4ai/_version.py") as f: for line in f: if line.startswith("__version__"): version = line.split("=")[1].strip().strip('"') break -# Define the requirements for different environments +# Define requirements default_requirements = requirements -# torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"] -# transformer_requirements = ["transformers", "tokenizers", "onnxruntime"] torch_requirements = ["torch", "nltk", "scikit-learn"] transformer_requirements = ["transformers", "tokenizers"] cosine_similarity_requirements = ["torch", "transformers", "nltk" ] @@ -50,10 +53,24 @@ def install_playwright(): print(f"Unexpected error during Playwright installation: {e}") print("Please run 'python -m playwright install' manually after the installation.") +def run_migration(): + """Initialize database during installation""" + try: + print("Starting database initialization...") + from crawl4ai.async_database import async_db_manager + asyncio.run(async_db_manager.initialize()) + print("Database initialization completed successfully.") + except ImportError: + print("Warning: Database module not found. 
Will initialize on first use.") + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") + class PostInstallCommand(install): def run(self): install.run(self) install_playwright() + run_migration() setup( name="Crawl4AI", @@ -66,7 +83,7 @@ setup( author_email="unclecode@kidocode.com", license="MIT", packages=find_packages(), - install_requires=default_requirements + ["playwright"], # Add playwright to default requirements + install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles extras_require={ "torch": torch_requirements, "transformer": transformer_requirements, @@ -77,6 +94,7 @@ setup( entry_points={ 'console_scripts': [ 'crawl4ai-download-models=crawl4ai.model_loader:main', + 'crawl4ai-migrate=crawl4ai.migrations:main', # Added migration command ], }, classifiers=[ From 509844208617673ee4cd066a4386a6c76fdadf91 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 15:30:24 +0800 Subject: [PATCH 11/50] refactor: migrate versioning to __version__.py and remove deprecated _version.py --- crawl4ai/__init__.py | 2 +- crawl4ai/{_version.py => __version__.py} | 0 crawl4ai/async_webcrawler.py | 2 +- crawl4ai/content_filter_strategy.py | 5 ----- middlewares.py | 0 requirements-dev.txt | 5 ----- requirements.txt | 2 ++ setup.py | 2 +- 8 files changed, 5 insertions(+), 13 deletions(-) rename crawl4ai/{_version.py => __version__.py} (100%) delete mode 100644 middlewares.py delete mode 100644 requirements-dev.txt diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 1bcc491c..e55aaf73 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -2,7 +2,7 @@ from .async_webcrawler import AsyncWebCrawler from .models import CrawlResult -from ._version import __version__ +from .__version__ import __version__ # __version__ = "0.3.73" __all__ = [ diff --git a/crawl4ai/_version.py b/crawl4ai/__version__.py similarity index 100% rename from 
crawl4ai/_version.py rename to crawl4ai/__version__.py diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index febc01d3..03e7a393 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -16,7 +16,7 @@ from .utils import ( InvalidCSSSelectorError, format_html ) -from ._version import __version__ as crawl4ai_version +from .__version__ import __version__ as crawl4ai_version class AsyncWebCrawler: def __init__( diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 850ebf11..88375da9 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -1,12 +1,7 @@ -import os import re -import time from bs4 import BeautifulSoup, Tag from typing import List, Tuple, Dict from rank_bm25 import BM25Okapi -import nltk -from time import perf_counter -from html5lib import parse, treebuilders from time import perf_counter from collections import deque from bs4 import BeautifulSoup, NavigableString, Tag diff --git a/middlewares.py b/middlewares.py deleted file mode 100644 index e69de29b..00000000 diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 7bc121a4..00000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,5 +0,0 @@ --r requirements.txt -pytest -pytest-asyncio -selenium -setuptools diff --git a/requirements.txt b/requirements.txt index 94f741ca..74e8b3d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,5 @@ requests~=2.26 beautifulsoup4~=4.12 tf-playwright-stealth~=1.0 xxhash~=3.4 +rank-bm25~=0.2 +aiofiles~=24.0 \ No newline at end of file diff --git a/setup.py b/setup.py index d3145ac1..d8ad2cd3 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file with open(os.path.join(__location__, "requirements.txt")) as f: requirements = f.read().splitlines() -with open("crawl4ai/_version.py") as f: +with open("crawl4ai/__version__.py") as f: 
for line in f: if line.startswith("__version__"): version = line.split("=")[1].strip().strip('"') From 90df6921b7be573d95795907fcdebd28002dfd9b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 15:34:30 +0800 Subject: [PATCH 12/50] feat(crawl_sync): add synchronous crawl endpoint and corresponding test --- docs/examples/docker_example.py | 21 +++++++++++++++++++++ main.py | 24 ++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index c22acd55..502f1e52 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -33,6 +33,13 @@ class Crawl4AiTester: return status time.sleep(2) + + def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, timeout=60) + if response.status_code == 408: + raise TimeoutError("Task did not complete within server timeout") + response.raise_for_status() + return response.json() def test_docker_deployment(version="basic"): tester = Crawl4AiTester() @@ -54,6 +61,7 @@ def test_docker_deployment(version="basic"): # Test cases based on version test_basic_crawl(tester) + test_basic_crawl_sync(tester) # if version in ["full", "transformer"]: # test_cosine_extraction(tester) @@ -78,6 +86,19 @@ def test_basic_crawl(tester: Crawl4AiTester): assert result["result"]["success"] assert len(result["result"]["markdown"]) > 0 +def test_basic_crawl_sync(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Sync) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + + result = tester.submit_sync(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['status'] == 'completed' + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { diff 
--git a/main.py b/main.py index a5da029c..660c3366 100644 --- a/main.py +++ b/main.py @@ -375,6 +375,30 @@ async def get_task_status(task_id: str): return response +@app.post("/crawl_sync") +async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: + task_id = await crawler_service.submit_task(request) + + # Wait up to 60 seconds for task completion + for _ in range(60): + task_info = crawler_service.task_manager.get_task(task_id) + if not task_info: + raise HTTPException(status_code=404, detail="Task not found") + + if task_info.status == TaskStatus.COMPLETED: + # Return same format as /task/{task_id} endpoint + if isinstance(task_info.result, list): + return {"status": task_info.status, "results": [result.dict() for result in task_info.result]} + return {"status": task_info.status, "result": task_info.result.dict()} + + if task_info.status == TaskStatus.FAILED: + raise HTTPException(status_code=500, detail=task_info.error) + + await asyncio.sleep(1) + + # If we get here, task didn't complete within timeout + raise HTTPException(status_code=408, detail="Task timed out") + @app.get("/health") async def health_check(): available_slots = await crawler_service.resource_monitor.get_available_slots() From e62c80729559457c937b9740cb3bab960e6103d3 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 16:38:13 +0800 Subject: [PATCH 13/50] feat(deploy): add Railway deployment configuration and setup instructions --- deploy/railway/README.md | 19 +++++++++++++++++++ deploy/railway/button.json | 33 +++++++++++++++++++++++++++++++++ deploy/railway/railway.toml | 18 ++++++++++++++++++ 3 files changed, 70 insertions(+) create mode 100644 deploy/railway/README.md create mode 100644 deploy/railway/button.json create mode 100644 deploy/railway/railway.toml diff --git a/deploy/railway/README.md b/deploy/railway/README.md new file mode 100644 index 00000000..155e7642 --- /dev/null +++ b/deploy/railway/README.md @@ -0,0 +1,19 @@ +# Railway Deployment + +## Quick Deploy 
+[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/crawl4ai) + +## Manual Setup +1. Fork this repository +2. Create a new Railway project +3. Configure environment variables: + - `INSTALL_TYPE`: basic or all + - `ENABLE_GPU`: true/false +4. Deploy! + +## Configuration +See `railway.toml` for: +- Memory limits +- Health checks +- Restart policies +- Scaling options \ No newline at end of file diff --git a/deploy/railway/button.json b/deploy/railway/button.json new file mode 100644 index 00000000..1fc52167 --- /dev/null +++ b/deploy/railway/button.json @@ -0,0 +1,33 @@ +{ + "name": "Crawl4AI", + "description": "LLM Friendly Web Crawler & Scraper", + "render": { + "dockerfile": { + "path": "Dockerfile" + } + }, + "env": [ + { + "key": "INSTALL_TYPE", + "description": "Installation type (basic/all)", + "default": "basic", + "required": true + }, + { + "key": "ENABLE_GPU", + "description": "Enable GPU support", + "default": "false", + "required": false + } + ], + "services": [ + { + "name": "web", + "dockerfile": "./Dockerfile", + "healthcheck": { + "path": "/health", + "port": 11235 + } + } + ] + } \ No newline at end of file diff --git a/deploy/railway/railway.toml b/deploy/railway/railway.toml new file mode 100644 index 00000000..f24d8fab --- /dev/null +++ b/deploy/railway/railway.toml @@ -0,0 +1,18 @@ +# railway.toml +[build] +builder = "DOCKERFILE" +dockerfilePath = "Dockerfile" + +[deploy] +startCommand = "uvicorn main:app --host 0.0.0.0 --port $PORT" +healthcheckPath = "/health" +restartPolicyType = "ON_FAILURE" +restartPolicyMaxRetries = 3 + +[deploy.memory] +soft = 2048 # 2GB min for Playwright +hard = 4096 # 4GB max + +[deploy.scaling] +min = 1 +max = 1 From f77f06a3bd4c1ef6e45b69a64959b55164bf4512 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 16:43:31 +0800 Subject: [PATCH 14/50] feat(deploy): add deployment configuration and templates for crawl4ai --- .do/app.yaml | 19 +++++++++++++++++++ 
.do/deploy.template.yaml | 22 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 .do/app.yaml create mode 100644 .do/deploy.template.yaml diff --git a/.do/app.yaml b/.do/app.yaml new file mode 100644 index 00000000..7e11aab7 --- /dev/null +++ b/.do/app.yaml @@ -0,0 +1,19 @@ +alerts: +- rule: DEPLOYMENT_FAILED +- rule: DOMAIN_FAILED +name: crawl4ai +region: nyc +services: +- dockerfile_path: Dockerfile + github: + branch: main + deploy_on_push: true + repo: unclecode/crawl4ai + health_check: + http_path: /health + http_port: 11235 + instance_count: 1 + instance_size_slug: basic-xs + name: web + routes: + - path: / \ No newline at end of file diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml new file mode 100644 index 00000000..ab76795d --- /dev/null +++ b/.do/deploy.template.yaml @@ -0,0 +1,22 @@ +spec: + name: crawl4ai + services: + - name: crawl4ai + git: + branch: main + repo_clone_url: https://github.com/unclecode/crawl4ai.git + dockerfile_path: Dockerfile + http_port: 11235 + instance_count: 1 + instance_size_slug: basic-xs + health_check: + http_path: /health + envs: + - key: INSTALL_TYPE + value: "basic" + - key: PYTHON_VERSION + value: "3.10" + - key: ENABLE_GPU + value: "false" + routes: + - path: / \ No newline at end of file From fca1319b7d1c3e3da5b07898d3890bced4a7719e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:10:30 +0800 Subject: [PATCH 15/50] feat(docker): add MkDocs installation and build step for documentation --- Dockerfile | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 125fb9b8..54ac641c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -107,13 +107,19 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ pip install -e "." 
; \ fi + # Install MkDocs and required plugins +RUN pip install --no-cache-dir \ + mkdocs \ + mkdocs-material \ + mkdocs-terminal \ + pymdown-extensions + +# Build MkDocs documentation +RUN mkdocs build + # Install Playwright and browsers RUN playwright install -# Health check -HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ - CMD curl -f http://localhost:8000/health || exit 1 - # Expose port EXPOSE 8000 11235 9222 8080 From 6f2fe5954f6ce9f7f17fb15802054cd6c5802123 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:12:41 +0800 Subject: [PATCH 16/50] feat(deploy): update instance size to professional-xs and add memory utilization alert --- .do/deploy.template.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml index ab76795d..c7db5e7e 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -8,7 +8,7 @@ spec: dockerfile_path: Dockerfile http_port: 11235 instance_count: 1 - instance_size_slug: basic-xs + instance_size_slug: professional-xs # 4GB RAM, 2 vCPUs health_check: http_path: /health envs: @@ -19,4 +19,7 @@ spec: - key: ENABLE_GPU value: "false" routes: - - path: / \ No newline at end of file + - path: / + alerts: + - rule: MEM_UTILIZATION + value: 90 # Alert at 90% memory usage \ No newline at end of file From 6b569cceb5332ea481190a86086fbf934c7c89e7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:21:45 +0800 Subject: [PATCH 17/50] feat(deploy): update branch to 0.3.74 and change instance size to basic-xs --- .do/app.yaml | 2 +- .do/deploy.template.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.do/app.yaml b/.do/app.yaml index 7e11aab7..bff8ff97 100644 --- a/.do/app.yaml +++ b/.do/app.yaml @@ -6,7 +6,7 @@ region: nyc services: - dockerfile_path: Dockerfile github: - branch: main + branch: 0.3.74 deploy_on_push: true repo: unclecode/crawl4ai health_check: diff --git 
a/.do/deploy.template.yaml b/.do/deploy.template.yaml index c7db5e7e..73ee7c5c 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -3,12 +3,12 @@ spec: services: - name: crawl4ai git: - branch: main + branch: 0.3.74 repo_clone_url: https://github.com/unclecode/crawl4ai.git dockerfile_path: Dockerfile http_port: 11235 instance_count: 1 - instance_size_slug: professional-xs # 4GB RAM, 2 vCPUs + instance_size_slug: basic-xs # 4GB RAM, 2 vCPUs health_check: http_path: /health envs: From 67edc2d641a672e9fa5a95fa4341407b9e574851 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:23:32 +0800 Subject: [PATCH 18/50] feat(deploy): update instance size to professional-xs and add memory utilization alert parameters --- .do/deploy.template.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml index 73ee7c5c..49d0012b 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -8,7 +8,7 @@ spec: dockerfile_path: Dockerfile http_port: 11235 instance_count: 1 - instance_size_slug: basic-xs # 4GB RAM, 2 vCPUs + instance_size_slug: professional-xs health_check: http_path: /health envs: @@ -22,4 +22,9 @@ spec: - path: / alerts: - rule: MEM_UTILIZATION - value: 90 # Alert at 90% memory usage \ No newline at end of file + value: 90 + window: 5m # Added window parameter + operator: GREATER_THAN + disabled: false + - rule: DEPLOYMENT_FAILED + - rule: DOMAIN_FAILED \ No newline at end of file From 5d0b13294cfec45c55b07a5593726335c79b6cde Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:25:07 +0800 Subject: [PATCH 19/50] feat(deploy): change instance size to professional-xs and update memory utilization alert window to 300 seconds --- .do/app.yaml | 2 +- .do/deploy.template.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.do/app.yaml b/.do/app.yaml index bff8ff97..00d7b781 100644 --- a/.do/app.yaml +++ b/.do/app.yaml @@ 
-13,7 +13,7 @@ services: http_path: /health http_port: 11235 instance_count: 1 - instance_size_slug: basic-xs + instance_size_slug: professional-xs name: web routes: - path: / \ No newline at end of file diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml index 49d0012b..dcd9b2d7 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -23,7 +23,7 @@ spec: alerts: - rule: MEM_UTILIZATION value: 90 - window: 5m # Added window parameter + window: 300 # Changed from "5m" to 300 (5 minutes in seconds) operator: GREATER_THAN disabled: false - rule: DEPLOYMENT_FAILED From 79feab89c4236e7de180ec4cd2257df3f5f3e386 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:28:42 +0800 Subject: [PATCH 20/50] refactor(deploy): remove memory utilization alert configuration from deployment template --- .do/deploy.template.yaml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml index dcd9b2d7..9a06a366 100644 --- a/.do/deploy.template.yaml +++ b/.do/deploy.template.yaml @@ -19,12 +19,4 @@ spec: - key: ENABLE_GPU value: "false" routes: - - path: / - alerts: - - rule: MEM_UTILIZATION - value: 90 - window: 300 # Changed from "5m" to 300 (5 minutes in seconds) - operator: GREATER_THAN - disabled: false - - rule: DEPLOYMENT_FAILED - - rule: DOMAIN_FAILED \ No newline at end of file + - path: / \ No newline at end of file From 1961adb530baf74dfec16a0f2da795946855459a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 17:35:27 +0800 Subject: [PATCH 21/50] refactor(docker): remove shared memory size configuration to streamline Dockerfile --- Dockerfile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 54ac641c..ba29faf1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -123,10 +123,5 @@ RUN playwright install # Expose port EXPOSE 8000 11235 9222 8080 -# Optional: Increase shared memory size to prevent browser crashes -# when loading 
heavy pages -RUN mkdir /dev/shm -VOLUME /dev/shm - # Start the FastAPI server CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"] \ No newline at end of file From 6360d0545ac2812687a1a9a31de95fa64f600ed4 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 18:08:56 +0800 Subject: [PATCH 22/50] feat(api): add API token authentication and update Dockerfile description --- Dockerfile | 3 ++- docker-compose.yml | 33 +++++++++++++++++++++++++++++++++ docs/examples/docker_example.py | 15 ++++++++++----- main.py | 23 ++++++++++++++++++++--- 4 files changed, 65 insertions(+), 9 deletions(-) create mode 100644 docker-compose.yml diff --git a/Dockerfile b/Dockerfile index ba29faf1..76b4e1cf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ ARG ENABLE_GPU=false # Platform-specific labels LABEL maintainer="unclecode" -LABEL description="Crawl4AI - Advanced Web Crawler with AI capabilities" +LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" LABEL version="1.0" # Environment setup @@ -79,6 +79,7 @@ COPY . . RUN pip install --no-cache-dir -r requirements.txt # Install required library for FastAPI +RUN pip install . RUN pip install fastapi uvicorn psutil # Install ML dependencies first for better layer caching diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..ef0dc9e4 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,33 @@ +version: '3.8' + +services: + crawl4ai: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: 3.10 + INSTALL_TYPE: all + ENABLE_GPU: false + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 502f1e52..6701f6ac 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -7,12 +7,14 @@ import os from typing import Dict, Any class Crawl4AiTester: - def __init__(self, base_url: str = "http://localhost:11235"): + def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None): self.base_url = base_url + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback + self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: # Submit crawl job - response = requests.post(f"{self.base_url}/crawl", json=request_data) + response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) task_id = response.json()["task_id"] print(f"Task ID: {task_id}") @@ -22,7 +24,7 @@ class Crawl4AiTester: if time.time() - start_time > timeout: raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds") - result = requests.get(f"{self.base_url}/task/{task_id}") + result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers) status = result.json() if status["status"] == 
"failed": @@ -35,14 +37,17 @@ class Crawl4AiTester: time.sleep(2) def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: - response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, timeout=60) + response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60) if response.status_code == 408: raise TimeoutError("Task did not complete within server timeout") response.raise_for_status() return response.json() def test_docker_deployment(version="basic"): - tester = Crawl4AiTester() + tester = Crawl4AiTester( + # base_url="http://localhost:11235" + base_url="https://crawl4ai-sby74.ondigitalocean.app" + ) print(f"Testing Crawl4AI Docker {version} version") # Health check with timeout and retry diff --git a/main.py b/main.py index 660c3366..92b1793b 100644 --- a/main.py +++ b/main.py @@ -10,6 +10,8 @@ from fastapi.exceptions import RequestValidationError from starlette.middleware.base import BaseHTTPMiddleware from starlette.responses import FileResponse from fastapi.responses import RedirectResponse +from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials +from fastapi import Depends, Security from pydantic import BaseModel, HttpUrl, Field from typing import Optional, List, Dict, Any, Union @@ -322,6 +324,21 @@ app.add_middleware( # Mount the pages directory as a static directory app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") +# API token security +security = HTTPBearer() +CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") + +async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)): + if not CRAWL4AI_API_TOKEN: + return credentials # No token verification if CRAWL4AI_API_TOKEN is not set + if credentials.credentials != CRAWL4AI_API_TOKEN: + raise HTTPException(status_code=401, detail="Invalid token") + return credentials + +# Helper function to conditionally apply security +def secure_endpoint(): + return 
Depends(verify_token) if CRAWL4AI_API_TOKEN else None + # Check if site directory exists if os.path.exists(__location__ + "/site"): # Mount the site directory as a static directory @@ -348,12 +365,12 @@ def read_root(): return {"message": "Crawl4AI API service is running"} -@app.post("/crawl") +@app.post("/crawl", dependencies=[Depends(verify_token)]) async def crawl(request: CrawlRequest) -> Dict[str, str]: task_id = await crawler_service.submit_task(request) return {"task_id": task_id} -@app.get("/task/{task_id}") +@app.get("/task/{task_id}", dependencies=[Depends(verify_token)]) async def get_task_status(task_id: str): task_info = crawler_service.task_manager.get_task(task_id) if not task_info: @@ -375,7 +392,7 @@ async def get_task_status(task_id: str): return response -@app.post("/crawl_sync") +@app.post("/crawl_sync", dependencies=[Depends(verify_token)]) async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: task_id = await crawler_service.submit_task(request) From 9139ef3125b8a0bc96e2b26f3a06b09ecc60c020 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 18:19:44 +0800 Subject: [PATCH 23/50] feat(docker): update Dockerfile for improved installation process and enhance deployment documentation with Docker Compose setup and API token security --- Dockerfile | 9 +- docs/md_v2/basic/docker-deploymeny.md | 123 +++++++++++++++++--------- 2 files changed, 86 insertions(+), 46 deletions(-) diff --git a/Dockerfile b/Dockerfile index 76b4e1cf..aac2280a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,7 +79,6 @@ COPY . . RUN pip install --no-cache-dir -r requirements.txt # Install required library for FastAPI -RUN pip install . 
RUN pip install fastapi uvicorn psutil # Install ML dependencies first for better layer caching @@ -97,15 +96,15 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ # Install the package RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ - pip install -e ".[all]" && \ + pip install ".[all]" && \ python -m crawl4ai.model_loader ; \ elif [ "$INSTALL_TYPE" = "torch" ] ; then \ - pip install -e ".[torch]" ; \ + pip install ".[torch]" ; \ elif [ "$INSTALL_TYPE" = "transformer" ] ; then \ - pip install -e ".[transformer]" && \ + pip install ".[transformer]" && \ python -m crawl4ai.model_loader ; \ else \ - pip install -e "." ; \ + pip install "." ; \ fi # Install MkDocs and required plugins diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index cc11d0d9..a500ee21 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -1,71 +1,112 @@ -# Docker Deployment +# Docker Deployment 🐳 Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments. 
-## Quick Start 🚀 +## Docker Compose Setup 🐳 -Pull and run the basic version: +### Basic Usage -```bash -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic +Create a `docker-compose.yml`: +```yaml +version: '3.8' + +services: + crawl4ai: + image: unclecode/crawl4ai:all + ports: + - "11235:11235" + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + restart: unless-stopped ``` -Test the deployment: +Run with: +```bash +docker-compose up -d +``` + +### Secure Mode with API Token + +To enable API authentication, simply set the `CRAWL4AI_API_TOKEN`: +```bash +CRAWL4AI_API_TOKEN=your-secret-token docker-compose up -d +``` + +### Using Environment Variables + +Create a `.env` file for your API tokens: +```env +# Crawl4AI API Security (optional) +CRAWL4AI_API_TOKEN=your-secret-token + +# LLM Provider API Keys +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... +GOOGLE_API_KEY=... +GEMINI_API_KEY=... +OLLAMA_API_KEY=... + +# Additional Configuration +MAX_CONCURRENT_TASKS=5 +``` + +Docker Compose will automatically load variables from the `.env` file. No additional configuration needed! 
+ +### Testing with API Token + ```python import requests -# Test health endpoint -health = requests.get("http://localhost:11235/health") -print("Health check:", health.json()) +# Initialize headers with token if using secure mode +headers = {} +if api_token := os.getenv('CRAWL4AI_API_TOKEN'): + headers['Authorization'] = f'Bearer {api_token}' -# Test basic crawl +# Test crawl with authentication response = requests.post( "http://localhost:11235/crawl", + headers=headers, json={ "urls": "https://www.nbcnews.com/business", "priority": 10 } ) task_id = response.json()["task_id"] -print("Task ID:", task_id) ``` -## Available Images 🏷️ +### Security Best Practices 🔒 -- `unclecode/crawl4ai:basic` - Basic web crawling capabilities -- `unclecode/crawl4ai:all` - Full installation with all features -- `unclecode/crawl4ai:gpu` - GPU-enabled version for ML features - -## Configuration Options 🔧 - -### Environment Variables - -```bash -docker run -p 11235:11235 \ - -e MAX_CONCURRENT_TASKS=5 \ - -e OPENAI_API_KEY=your_key \ - unclecode/crawl4ai:all +- Add `.env` to your `.gitignore` +- Use different API tokens for development and production +- Rotate API tokens periodically +- Use secure methods to pass tokens in production environments ``` -### Volume Mounting +This addition to your documentation: +1. Shows how to use Docker Compose +2. Explains both secure and non-secure modes +3. Demonstrates environment variable configuration +4. Provides example code for authenticated requests +5. 
Includes security best practices + + + + + + + + + + + + -Mount a directory for persistent data: -```bash -docker run -p 11235:11235 \ - -v $(pwd)/data:/app/data \ - unclecode/crawl4ai:all -``` -### Resource Limits -Control container resources: -```bash -docker run -p 11235:11235 \ - --memory=4g \ - --cpus=2 \ - unclecode/crawl4ai:all -``` ## Usage Examples 📝 From 4b45b28f256ad62272d5ea75ae898de7882618ba Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 18:44:47 +0800 Subject: [PATCH 24/50] feat(docs): enhance deployment documentation with one-click setup, API security details, and Docker Compose examples --- README.md | 15 ++ docs/examples/docker_example.py | 13 +- docs/md_v2/basic/docker-deploymeny.md | 230 ++++++++++++++++++++------ main.py | 3 + 4 files changed, 207 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index d250f936..a2806304 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,21 @@ cd crawl4ai pip install -e . ``` +## One-Click Deployment 🚀 + +Deploy your own instance of Crawl4AI with one click: + +[![DigitalOcean Referral Badge](https://web-platforms.sfo2.cdn.digitaloceanspaces.com/WWW/Badge%203.svg)](https://www.digitalocean.com/?repo=https://github.com/unclecode/crawl4ai/tree/0.3.74&refcode=a0780f1bdb3d&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge) + + +> 💡 **Recommended specs**: 4GB RAM minimum. Select "professional-xs" or higher when deploying for stable operation. + +The deploy will: +- Set up a Docker container with Crawl4AI +- Configure Playwright and all dependencies +- Start the FastAPI server on port 11235 +- Set up health checks and auto-deployment + ### Using Docker 🐳 Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository. 
diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 6701f6ac..b43e8ee6 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -15,6 +15,8 @@ class Crawl4AiTester: def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: # Submit crawl job response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) + if response.status_code == 403: + raise Exception("API token is invalid or missing") task_id = response.json()["task_id"] print(f"Task ID: {task_id}") @@ -45,8 +47,9 @@ class Crawl4AiTester: def test_docker_deployment(version="basic"): tester = Crawl4AiTester( - # base_url="http://localhost:11235" - base_url="https://crawl4ai-sby74.ondigitalocean.app" + base_url="http://localhost:11235" , + # base_url="https://crawl4ai-sby74.ondigitalocean.app", + api_token="test" ) print(f"Testing Crawl4AI Docker {version} version") @@ -83,7 +86,8 @@ def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") request = { "urls": "https://www.nbcnews.com/business", - "priority": 10 + "priority": 10, + "session_id": "test" } result = tester.submit_and_wait(request) @@ -95,7 +99,8 @@ def test_basic_crawl_sync(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl (Sync) ===") request = { "urls": "https://www.nbcnews.com/business", - "priority": 10 + "priority": 10, + "session_id": "test" } result = tester.submit_sync(request) diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index a500ee21..30555708 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -1,12 +1,115 @@ -# Docker Deployment 🐳 +# Docker Deployment Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments. 
-## Docker Compose Setup 🐳 +## Quick Start 🚀 -### Basic Usage +Pull and run the basic version: + +```bash +# Basic run without security +docker pull unclecode/crawl4ai:basic +docker run -p 11235:11235 unclecode/crawl4ai:basic + +# Run with API security enabled +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic +``` + +## API Security 🔒 + +### Understanding CRAWL4AI_API_TOKEN + +The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance: + +- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication +- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible + +```bash +# Secured Instance +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all + +# Unsecured Instance +docker run -p 11235:11235 unclecode/crawl4ai:all +``` + +### Making API Calls + +For secured instances, include the token in all requests: + +```python +import requests + +# Setup headers if token is being used +api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN +headers = {"Authorization": f"Bearer {api_token}"} if api_token else {} + +# Making authenticated requests +response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json={ + "urls": "https://example.com", + "priority": 10 + } +) + +# Checking task status +task_id = response.json()["task_id"] +status = requests.get( + f"http://localhost:11235/task/{task_id}", + headers=headers +) +``` + +### Using with Docker Compose + +In your `docker-compose.yml`: +```yaml +services: + crawl4ai: + image: unclecode/crawl4ai:all + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional + # ... other configuration +``` + +Then either: +1. Set in `.env` file: +```env +CRAWL4AI_API_TOKEN=your_secret_token +``` + +2. 
Or set via command line: +```bash +CRAWL4AI_API_TOKEN=your_secret_token docker-compose up +``` + +> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`). + +## Configuration Options 🔧 + +### Environment Variables + +You can configure the service using environment variables: + +```bash +# Basic configuration +docker run -p 11235:11235 \ + -e MAX_CONCURRENT_TASKS=5 \ + unclecode/crawl4ai:all + +# With security and LLM support +docker run -p 11235:11235 \ + -e CRAWL4AI_API_TOKEN=your_secret_token \ + -e OPENAI_API_KEY=sk-... \ + -e ANTHROPIC_API_KEY=sk-ant-... \ + unclecode/crawl4ai:all +``` + +### Using Docker Compose (Recommended) 🐳 Create a `docker-compose.yml`: + ```yaml version: '3.8' @@ -15,83 +118,110 @@ services: image: unclecode/crawl4ai:all ports: - "11235:11235" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security + - MAX_CONCURRENT_TASKS=5 + # LLM Provider Keys + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} volumes: - /dev/shm:/dev/shm deploy: resources: limits: memory: 4G - restart: unless-stopped + reservations: + memory: 1G ``` -Run with: +You can run it in two ways: + +1. Using environment variables directly: ```bash -docker-compose up -d +CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up ``` -### Secure Mode with API Token - -To enable API authentication, simply set the `CRAWL4AI_API_TOKEN`: -```bash -CRAWL4AI_API_TOKEN=your-secret-token docker-compose up -d -``` - -### Using Environment Variables - -Create a `.env` file for your API tokens: +2. 
Using a `.env` file (recommended): +Create a `.env` file in the same directory: ```env -# Crawl4AI API Security (optional) -CRAWL4AI_API_TOKEN=your-secret-token +# API Security (optional) +CRAWL4AI_API_TOKEN=your_secret_token -# LLM Provider API Keys +# LLM Provider Keys OPENAI_API_KEY=sk-... ANTHROPIC_API_KEY=sk-ant-... -GOOGLE_API_KEY=... -GEMINI_API_KEY=... -OLLAMA_API_KEY=... -# Additional Configuration +# Other Configuration MAX_CONCURRENT_TASKS=5 ``` -Docker Compose will automatically load variables from the `.env` file. No additional configuration needed! +Then simply run: +```bash +docker-compose up +``` -### Testing with API Token +### Testing the Deployment 🧪 ```python import requests -# Initialize headers with token if using secure mode -headers = {} -if api_token := os.getenv('CRAWL4AI_API_TOKEN'): - headers['Authorization'] = f'Bearer {api_token}' +# For unsecured instances +def test_unsecured(): + # Health check + health = requests.get("http://localhost:11235/health") + print("Health check:", health.json()) -# Test crawl with authentication -response = requests.post( - "http://localhost:11235/crawl", - headers=headers, - json={ - "urls": "https://www.nbcnews.com/business", - "priority": 10 + # Basic crawl + response = requests.post( + "http://localhost:11235/crawl", + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) + +# For secured instances +def test_secured(api_token): + headers = {"Authorization": f"Bearer {api_token}"} + + # Basic crawl with authentication + response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) +``` + +### LLM Extraction Example 🤖 + +When you've configured your LLM provider keys (via environment variables or `.env`), you can use LLM extraction: + +```python 
+request = { + "urls": "https://example.com", + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4", + "instruction": "Extract main topics from the page" + } } -) -task_id = response.json()["task_id"] +} + +# Make the request (add headers if using API security) +response = requests.post("http://localhost:11235/crawl", json=request) ``` -### Security Best Practices 🔒 +> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! -- Add `.env` to your `.gitignore` -- Use different API tokens for development and production -- Rotate API tokens periodically -- Use secure methods to pass tokens in production environments -``` -This addition to your documentation: -1. Shows how to use Docker Compose -2. Explains both secure and non-secure modes -3. Demonstrates environment variable configuration -4. Provides example code for authenticated requests -5. Includes security best practices diff --git a/main.py b/main.py index 92b1793b..41788d61 100644 --- a/main.py +++ b/main.py @@ -65,6 +65,7 @@ class CrawlRequest(BaseModel): screenshot: bool = False magic: bool = False extra: Optional[Dict[str, Any]] = {} + session_id: Optional[str] = None @dataclass class TaskInfo: @@ -284,6 +285,7 @@ class CrawlerService: css_selector=request.css_selector, screenshot=request.screenshot, magic=request.magic, + session_id=request.session_id, **request.extra, ) else: @@ -295,6 +297,7 @@ class CrawlerService: css_selector=request.css_selector, screenshot=request.screenshot, magic=request.magic, + session_id=request.session_id, **request.extra, ) From 3a66aa8a60ae7213bb8437003b58a631df208ffb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 15:30:56 +0800 Subject: [PATCH 25/50] feat(cache): introduce CacheMode and CacheContext for enhanced caching behavior chore(requirements): add colorama dependency refactor(config): add SHOW_DEPRECATION_WARNINGS flag and clean up code fix(docs): update example scripts for clarity and 
consistency --- crawl4ai/__init__.py | 3 +- crawl4ai/async_crawler_strategy.py | 8 +- crawl4ai/async_webcrawler.3.73.py | 344 +++++++++++++++++++++++++++++ crawl4ai/async_webcrawler.py | 285 +++++++++++++++++------- crawl4ai/cache_context.py | 79 +++++++ crawl4ai/config.py | 3 +- docs/examples/docker_example.py | 5 +- docs/examples/quickstart_async.py | 12 +- requirements.txt | 3 +- tests/docker_example.py | 332 ++++++++++++++++++++++++++++ 10 files changed, 979 insertions(+), 95 deletions(-) create mode 100644 crawl4ai/async_webcrawler.3.73.py create mode 100644 crawl4ai/cache_context.py create mode 100644 tests/docker_example.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index e55aaf73..ad9475b4 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -1,6 +1,6 @@ # __init__.py -from .async_webcrawler import AsyncWebCrawler +from .async_webcrawler import AsyncWebCrawler, CacheMode from .models import CrawlResult from .__version__ import __version__ # __version__ = "0.3.73" @@ -8,6 +8,7 @@ from .__version__ import __version__ __all__ = [ "AsyncWebCrawler", "CrawlResult", + "CacheMode", ] def is_sync_version_installed(): diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 83933a35..a67591af 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -669,8 +669,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.accept_downloads: page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) - if self.verbose: - print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") + # if self.verbose: + # print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") if self.use_cached_html: cache_file_path = os.path.join( @@ -873,8 +873,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await asyncio.sleep(screenshot_wait_for) screenshot_data = await self.take_screenshot(page) - if self.verbose: - 
print(f"[LOG] ✅ Crawled {url} successfully!") + # if self.verbose: + # print(f"[LOG] ✅ Crawled {url} successfully!") if self.use_cached_html: cache_file_path = os.path.join( diff --git a/crawl4ai/async_webcrawler.3.73.py b/crawl4ai/async_webcrawler.3.73.py new file mode 100644 index 00000000..03e7a393 --- /dev/null +++ b/crawl4ai/async_webcrawler.3.73.py @@ -0,0 +1,344 @@ +import os +import time +from pathlib import Path +from typing import Optional +import json +import asyncio +from .models import CrawlResult +from .async_database import async_db_manager +from .chunking_strategy import * +from .extraction_strategy import * +from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse +from .content_scrapping_strategy import WebScrapingStrategy +from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD +from .utils import ( + sanitize_input_encode, + InvalidCSSSelectorError, + format_html +) +from .__version__ import __version__ as crawl4ai_version + +class AsyncWebCrawler: + def __init__( + self, + crawler_strategy: Optional[AsyncCrawlerStrategy] = None, + always_by_pass_cache: bool = False, + base_directory: str = str(Path.home()), + **kwargs, + ): + self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( + **kwargs + ) + self.always_by_pass_cache = always_by_pass_cache + # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") + os.makedirs(self.crawl4ai_folder, exist_ok=True) + os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) + self.ready = False + self.verbose = kwargs.get("verbose", False) + + async def __aenter__(self): + await self.crawler_strategy.__aenter__() + await self.awarmup() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) + + async def awarmup(self): + # Print a message for crawl4ai and its 
version + if self.verbose: + print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") + print("[LOG] 🌤️ Warming up the AsyncWebCrawler") + # await async_db_manager.ainit_db() + # # await async_db_manager.initialize() + # await self.arun( + # url="https://google.com/", + # word_count_threshold=5, + # bypass_cache=False, + # verbose=False, + # ) + self.ready = True + if self.verbose: + print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") + + async def arun( + self, + url: str, + word_count_threshold=MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + bypass_cache: bool = False, + css_selector: str = None, + screenshot: bool = False, + user_agent: str = None, + verbose=True, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + **kwargs, + ) -> CrawlResult: + """ + Runs the crawler for a single source: URL (web, local file, or raw HTML). + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + ... [other existing parameters] + + Returns: + CrawlResult: The result of the crawling and processing. 
+ """ + try: + if disable_cache: + bypass_cache = True + no_cache_read = True + no_cache_write = True + + extraction_strategy = extraction_strategy or NoExtractionStrategy() + extraction_strategy.verbose = verbose + if not isinstance(extraction_strategy, ExtractionStrategy): + raise ValueError("Unsupported extraction strategy") + if not isinstance(chunking_strategy, ChunkingStrategy): + raise ValueError("Unsupported chunking strategy") + + word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) + + async_response: AsyncCrawlResponse = None + cached = None + screenshot_data = None + extracted_content = None + + is_web_url = url.startswith(('http://', 'https://')) + is_local_file = url.startswith("file://") + is_raw_html = url.startswith("raw:") + _url = url if not is_raw_html else "Raw HTML" + + start_time = time.perf_counter() + cached_result = None + if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache: + cached_result = await async_db_manager.aget_cached_url(url) + + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode(cached_result.extracted_content or "") + if screenshot: + screenshot_data = cached_result.screenshot + if not screenshot_data: + cached_result = None + if verbose: + print( + f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds" + ) + + + if not cached or not html: + t1 = time.perf_counter() + + if user_agent: + self.crawler_strategy.update_user_agent(user_agent) + async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) + html = sanitize_input_encode(async_response.html) + screenshot_data = async_response.screenshot + t2 = time.perf_counter() + if verbose: + print( + f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" + ) + + t1 = time.perf_counter() + 
crawl_result = await self.aprocess_html( + url=url, + html=html, + extracted_content=extracted_content, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + css_selector=css_selector, + screenshot=screenshot_data, + verbose=verbose, + is_cached=bool(cached), + async_response=async_response, + bypass_cache=bypass_cache, + is_web_url = is_web_url, + is_local_file = is_local_file, + is_raw_html = is_raw_html, + **kwargs, + ) + + if async_response: + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + else: + crawl_result.status_code = 200 + crawl_result.response_headers = cached_result.response_headers if cached_result else {} + + crawl_result.success = bool(html) + crawl_result.session_id = kwargs.get("session_id", None) + + if verbose: + print( + f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds" + ) + + if not is_raw_html and not no_cache_write: + if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: + await async_db_manager.acache_url(crawl_result) + + + return crawl_result + + except Exception as e: + if not hasattr(e, "msg"): + e.msg = str(e) + print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}") + return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg) + + async def arun_many( + self, + urls: List[str], + word_count_threshold=MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = RegexChunking(), + bypass_cache: bool = False, + css_selector: str = None, + screenshot: bool = False, + user_agent: str = None, + verbose=True, + **kwargs, + ) -> List[CrawlResult]: + """ + Runs the 
crawler for multiple sources: URLs (web, local files, or raw HTML). + + Args: + urls (List[str]): A list of URLs with supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + ... [other existing parameters] + + Returns: + List[CrawlResult]: The results of the crawling and processing. + """ + semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + async with semaphore: + return await self.arun( + url, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + bypass_cache=bypass_cache, + css_selector=css_selector, + screenshot=screenshot, + user_agent=user_agent, + verbose=verbose, + **kwargs, + ) + + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + return [result if not isinstance(result, Exception) else str(result) for result in results] + + async def aprocess_html( + self, + url: str, + html: str, + extracted_content: str, + word_count_threshold: int, + extraction_strategy: ExtractionStrategy, + chunking_strategy: ChunkingStrategy, + css_selector: str, + screenshot: str, + verbose: bool, + **kwargs, + ) -> CrawlResult: + t = time.perf_counter() + # Extract content from HTML + try: + _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" + t1 = time.perf_counter() + scrapping_strategy = WebScrapingStrategy() + # result = await scrapping_strategy.ascrap( + result = scrapping_strategy.scrap( + url, + html, + word_count_threshold=word_count_threshold, + css_selector=css_selector, + only_text=kwargs.get("only_text", False), + image_description_min_word_threshold=kwargs.get( + "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD + ), + **kwargs, + ) + + if result is None: + raise 
ValueError(f"Process HTML, Failed to extract content from the website: {url}") + except InvalidCSSSelectorError as e: + raise ValueError(str(e)) + except Exception as e: + raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") + + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) + markdown = sanitize_input_encode(result.get("markdown", "")) + fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) + fit_html = sanitize_input_encode(result.get("fit_html", "")) + media = result.get("media", []) + links = result.get("links", []) + metadata = result.get("metadata", {}) + + if verbose: + print( + f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds" + ) + + if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): + t1 = time.perf_counter() + # Check if extraction strategy is type of JsonCssExtractionStrategy + if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): + extraction_strategy.verbose = verbose + extracted_content = extraction_strategy.run(url, [html]) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + else: + sections = chunking_strategy.chunk(markdown) + extracted_content = extraction_strategy.run(url, sections) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + if verbose: + print( + f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds" + ) + + screenshot = None if not screenshot else screenshot + + return CrawlResult( + url=url, + html=html, + cleaned_html=format_html(cleaned_html), + markdown=markdown, + fit_markdown=fit_markdown, + fit_html= fit_html, + media=media, + links=links, + metadata=metadata, + screenshot=screenshot, + 
extracted_content=extracted_content, + success=True, + error_message="", + ) + + async def aclear_cache(self): + # await async_db_manager.aclear_db() + await async_db_manager.cleanup() + + async def aflush_cache(self): + await async_db_manager.aflush_db() + + async def aget_cache_size(self): + return await async_db_manager.aget_total_count() + + diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 03e7a393..d554576d 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -1,7 +1,10 @@ import os import time +import warnings +from enum import Enum +from colorama import init, Fore, Back, Style from pathlib import Path -from typing import Optional +from typing import Optional, List, Union import json import asyncio from .models import CrawlResult @@ -9,8 +12,13 @@ from .async_database import async_db_manager from .chunking_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse +from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy -from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD +from .config import ( + MIN_WORD_THRESHOLD, + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + SHOW_DEPRECATION_WARNINGS # New import +) from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, @@ -18,19 +26,77 @@ from .utils import ( ) from .__version__ import __version__ as crawl4ai_version + class AsyncWebCrawler: + """ + Asynchronous web crawler with flexible caching capabilities. 
+ + Migration Guide (from version X.X.X): + Old way (deprecated): + crawler = AsyncWebCrawler(always_by_pass_cache=True) + result = await crawler.arun( + url="https://example.com", + bypass_cache=True, + no_cache_read=True, + no_cache_write=False + ) + + New way (recommended): + crawler = AsyncWebCrawler(always_bypass_cache=True) + result = await crawler.arun( + url="https://example.com", + cache_mode=CacheMode.WRITE_ONLY + ) + + To disable deprecation warnings: + Set SHOW_DEPRECATION_WARNINGS = False in config.py + """ + def __init__( self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, - always_by_pass_cache: bool = False, + always_bypass_cache: bool = False, + always_by_pass_cache: Optional[bool] = None, # Deprecated parameter base_directory: str = str(Path.home()), **kwargs, ): - self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( - **kwargs - ) - self.always_by_pass_cache = always_by_pass_cache - # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + """ + Initialize the AsyncWebCrawler. + + Args: + crawler_strategy: Strategy for crawling web pages + always_bypass_cache: Whether to always bypass cache (new parameter) + always_by_pass_cache: Deprecated, use always_bypass_cache instead + base_directory: Base directory for storing cache + """ + init() + self.log_width = 10 # Width of "[COMPLETE]" + self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") + self.log_icons = { + 'INIT': '→', # Alternative: '▶' or '►' + 'READY': '✓', # Alternative: '√' + 'FETCH': '↓', # Alternative: '▼' + 'SCRAPE': '◆', # Alternative: '♦' + 'EXTRACT': '■', # Alternative: '□' + 'COMPLETE': '●', # Alternative: '○' + 'ERROR': '×' + } + self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs) + + # Handle deprecated parameter + if always_by_pass_cache is not None: + if SHOW_DEPRECATION_WARNINGS: + warnings.warn( + "'always_by_pass_cache' is deprecated and will be removed in version X.X.X. 
" + "Use 'always_bypass_cache' instead. " + "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + self.always_bypass_cache = always_by_pass_cache + else: + self.always_bypass_cache = always_bypass_cache + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) @@ -46,21 +112,13 @@ class AsyncWebCrawler: await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) async def awarmup(self): - # Print a message for crawl4ai and its version + """Initialize the crawler with warm-up sequence.""" if self.verbose: - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") - print("[LOG] 🌤️ Warming up the AsyncWebCrawler") - # await async_db_manager.ainit_db() - # # await async_db_manager.initialize() - # await self.arun( - # url="https://google.com/", - # word_count_threshold=5, - # bypass_cache=False, - # verbose=False, - # ) + print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") + print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") self.ready = True if self.verbose: - print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") + print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") async def arun( self, @@ -68,35 +126,81 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + cache_mode: Optional[CacheMode] = None, + # Deprecated parameters bypass_cache: bool = False, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + # Other parameters css_selector: str = None, screenshot: bool = False, user_agent: str = None, verbose=True, - disable_cache: bool = False, - 
no_cache_read: bool = False, - no_cache_write: bool = False, **kwargs, ) -> CrawlResult: """ Runs the crawler for a single source: URL (web, local file, or raw HTML). + Migration from legacy cache parameters: + Old way (deprecated): + await crawler.arun(url, bypass_cache=True, no_cache_read=True) + + New way: + await crawler.arun(url, cache_mode=CacheMode.BYPASS) + Args: - url (str): The URL to crawl. Supported prefixes: - - 'http://' or 'https://': Web URL to crawl. - - 'file://': Local file path to process. - - 'raw:': Raw HTML content to process. - ... [other existing parameters] + url: The URL to crawl (http://, https://, file://, or raw:) + cache_mode: Cache behavior control (recommended) + word_count_threshold: Minimum word count threshold + extraction_strategy: Strategy for content extraction + chunking_strategy: Strategy for content chunking + css_selector: CSS selector for content extraction + screenshot: Whether to capture screenshot + user_agent: Custom user agent + verbose: Enable verbose logging + + Deprecated Args: + bypass_cache: Use cache_mode=CacheMode.BYPASS instead + disable_cache: Use cache_mode=CacheMode.DISABLED instead + no_cache_read: Use cache_mode=CacheMode.WRITE_ONLY instead + no_cache_write: Use cache_mode=CacheMode.READ_ONLY instead Returns: - CrawlResult: The result of the crawling and processing. + CrawlResult: The result of crawling and processing """ try: - if disable_cache: - bypass_cache = True - no_cache_read = True - no_cache_write = True + # Handle deprecated parameters + if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): + if SHOW_DEPRECATION_WARNINGS: + warnings.warn( + "Cache control boolean flags are deprecated and will be removed in version X.X.X. " + "Use 'cache_mode' parameter instead. 
Examples:\n" + "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" + "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" + "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" + "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" + "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + + # Convert legacy parameters if cache_mode not provided + if cache_mode is None: + cache_mode = _legacy_to_cache_mode( + disable_cache=disable_cache, + bypass_cache=bypass_cache, + no_cache_read=no_cache_read, + no_cache_write=no_cache_write + ) + # Default to ENABLED if no cache mode specified + if cache_mode is None: + cache_mode = CacheMode.ENABLED + + # Create cache context + cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) + extraction_strategy = extraction_strategy or NoExtractionStrategy() extraction_strategy.verbose = verbose if not isinstance(extraction_strategy, ExtractionStrategy): @@ -107,18 +211,14 @@ class AsyncWebCrawler: word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) async_response: AsyncCrawlResponse = None - cached = None + cached_result = None screenshot_data = None extracted_content = None - is_web_url = url.startswith(('http://', 'https://')) - is_local_file = url.startswith("file://") - is_raw_html = url.startswith("raw:") - _url = url if not is_raw_html else "Raw HTML" - start_time = time.perf_counter() - cached_result = None - if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache: + + # Try to get cached result if appropriate + if cache_context.should_read(): cached_result = await async_db_manager.aget_cached_url(url) if cached_result: @@ -129,26 +229,27 @@ class AsyncWebCrawler: if not screenshot_data: cached_result = None if verbose: - print( - f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} 
seconds" - ) + print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") - if not cached or not html: + # Fetch fresh content if needed + if not cached_result or not html: t1 = time.perf_counter() if user_agent: self.crawler_strategy.update_user_agent(user_agent) - async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) + async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( + url, + screenshot=screenshot, + **kwargs + ) html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot t2 = time.perf_counter() if verbose: - print( - f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" - ) + print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") - t1 = time.perf_counter() + # Process the HTML content crawl_result = await self.aprocess_html( url=url, html=html, @@ -159,15 +260,15 @@ class AsyncWebCrawler: css_selector=css_selector, screenshot=screenshot_data, verbose=verbose, - is_cached=bool(cached), + is_cached=bool(cached_result), async_response=async_response, - bypass_cache=bypass_cache, - is_web_url = is_web_url, - is_local_file = is_local_file, - is_raw_html = is_raw_html, + is_web_url=cache_context.is_web_url, + is_local_file=cache_context.is_local_file, + is_raw_html=cache_context.is_raw_html, **kwargs, ) + # Set response data if async_response: crawl_result.status_code = async_response.status_code crawl_result.response_headers = async_response.response_headers @@ -180,22 +281,26 @@ class AsyncWebCrawler: crawl_result.session_id = kwargs.get("session_id", None) 
if verbose: - print( - f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds" - ) + print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url} | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") - if not is_raw_html and not no_cache_write: - if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url(crawl_result) + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) return crawl_result except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}") - return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg) + print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url} | {e.msg}{Style.RESET_ALL}") + return CrawlResult( + url=url, + html="", + markdown=f"[ERROR] 🚫 arun(): Failed to crawl {cache_context.display_url}, error: {e.msg}", + success=False, + error_message=e.msg + ) async def arun_many( self, @@ -203,6 +308,8 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + cache_mode: Optional[CacheMode] = None, + # Deprecated parameters bypass_cache: bool = False, css_selector: str = None, screenshot: bool = False, @@ -211,19 +318,35 @@ class AsyncWebCrawler: **kwargs, ) -> List[CrawlResult]: """ - Runs the crawler for multiple sources: URLs (web, local files, or raw HTML). + Runs the crawler for multiple URLs concurrently. 
+ + Migration from legacy parameters: + Old way (deprecated): + results = await crawler.arun_many(urls, bypass_cache=True) + + New way: + results = await crawler.arun_many(urls, cache_mode=CacheMode.BYPASS) Args: - urls (List[str]): A list of URLs with supported prefixes: - - 'http://' or 'https://': Web URL to crawl. - - 'file://': Local file path to process. - - 'raw:': Raw HTML content to process. - ... [other existing parameters] + urls: List of URLs to crawl + cache_mode: Cache behavior control (recommended) + [other parameters same as arun()] Returns: - List[CrawlResult]: The results of the crawling and processing. + List[CrawlResult]: Results for each URL """ - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed + if bypass_cache and SHOW_DEPRECATION_WARNINGS: + warnings.warn( + "'bypass_cache' is deprecated and will be removed in version X.X.X. " + "Use 'cache_mode=CacheMode.BYPASS' instead. " + "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + if cache_mode is None: + cache_mode = CacheMode.BYPASS + + semaphore_count = kwargs.get('semaphore_count', 5) semaphore = asyncio.Semaphore(semaphore_count) async def crawl_with_semaphore(url): @@ -233,7 +356,7 @@ class AsyncWebCrawler: word_count_threshold=word_count_threshold, extraction_strategy=extraction_strategy, chunking_strategy=chunking_strategy, - bypass_cache=bypass_cache, + cache_mode=cache_mode, css_selector=css_selector, screenshot=screenshot, user_agent=user_agent, @@ -245,6 +368,7 @@ class AsyncWebCrawler: results = await asyncio.gather(*tasks, return_exceptions=True) return [result if not isinstance(result, Exception) else str(result) for result in results] + async def aprocess_html( self, url: str, @@ -258,7 +382,6 @@ class AsyncWebCrawler: verbose: bool, **kwargs, ) -> CrawlResult: - t = time.perf_counter() # Extract content from HTML try: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" @@ 
-293,9 +416,9 @@ class AsyncWebCrawler: metadata = result.get("metadata", {}) if verbose: - print( - f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds" - ) + print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url}{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") + + if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): t1 = time.perf_counter() @@ -309,9 +432,9 @@ class AsyncWebCrawler: extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) if verbose: - print( - f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds" - ) + print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url}{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + + screenshot = None if not screenshot else screenshot @@ -332,13 +455,15 @@ class AsyncWebCrawler: ) async def aclear_cache(self): - # await async_db_manager.aclear_db() + """Clear the cache database.""" await async_db_manager.cleanup() async def aflush_cache(self): + """Flush the cache database.""" await async_db_manager.aflush_db() async def aget_cache_size(self): + """Get the total number of cached items.""" return await async_db_manager.aget_total_count() diff --git a/crawl4ai/cache_context.py b/crawl4ai/cache_context.py new file mode 100644 index 00000000..429eacc1 --- /dev/null +++ b/crawl4ai/cache_context.py @@ -0,0 +1,79 @@ +from enum import Enum + + +class CacheMode(Enum): + """ + Defines the caching behavior for web crawling operations. 
+ + Modes: + - ENABLED: Normal caching behavior (read and write) + - DISABLED: No caching at all + - READ_ONLY: Only read from cache, don't write + - WRITE_ONLY: Only write to cache, don't read + - BYPASS: Bypass cache for this operation + """ + ENABLED = "enabled" + DISABLED = "disabled" + READ_ONLY = "read_only" + WRITE_ONLY = "write_only" + BYPASS = "bypass" + + +class CacheContext: + """ + Encapsulates cache-related decisions and URL handling. + + This class centralizes all cache-related logic and URL type checking, + making the caching behavior more predictable and maintainable. + """ + def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False): + self.url = url + self.cache_mode = cache_mode + self.always_bypass = always_bypass + self.is_cacheable = url.startswith(('http://', 'https://', 'file://')) + self.is_web_url = url.startswith(('http://', 'https://')) + self.is_local_file = url.startswith("file://") + self.is_raw_html = url.startswith("raw:") + self._url_display = url if not self.is_raw_html else "Raw HTML" + + def should_read(self) -> bool: + """Determines if cache should be read based on context.""" + if self.always_bypass or not self.is_cacheable: + return False + return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY] + + def should_write(self) -> bool: + """Determines if cache should be written based on context.""" + if self.always_bypass or not self.is_cacheable: + return False + return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY] + + @property + def display_url(self) -> str: + """Returns the URL in display format.""" + return self._url_display + + +def _legacy_to_cache_mode( + disable_cache: bool = False, + bypass_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False +) -> CacheMode: + """ + Converts legacy cache parameters to the new CacheMode enum. + + This is an internal function to help transition from the old boolean flags + to the new CacheMode system. 
+ """ + if disable_cache: + return CacheMode.DISABLED + if bypass_cache: + return CacheMode.BYPASS + if no_cache_read and no_cache_write: + return CacheMode.DISABLED + if no_cache_read: + return CacheMode.WRITE_ONLY + if no_cache_write: + return CacheMode.READ_ONLY + return CacheMode.ENABLED diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 5bc284bf..6b1324dd 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -54,4 +54,5 @@ IMAGE_SCORE_THRESHOLD = 2 MAX_METRICS_HISTORY = 1000 -NEED_MIGRATION = True \ No newline at end of file +NEED_MIGRATION = True +SHOW_DEPRECATION_WARNINGS = True \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index b43e8ee6..898f14da 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -48,8 +48,8 @@ class Crawl4AiTester: def test_docker_deployment(version="basic"): tester = Crawl4AiTester( base_url="http://localhost:11235" , - # base_url="https://crawl4ai-sby74.ondigitalocean.app", - api_token="test" + # base_url="https://api.crawl4ai.com" # just for example + # api_token="test" # just for example ) print(f"Testing Crawl4AI Docker {version} version") @@ -69,6 +69,7 @@ def test_docker_deployment(version="basic"): # Test cases based on version test_basic_crawl(tester) + test_basic_crawl(tester) test_basic_crawl_sync(tester) # if version in ["full", "transformer"]: diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9c57f57d..d67a8c30 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -71,12 +71,12 @@ async def use_proxy(): "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example." 
) # Uncomment and modify the following lines to use a proxy - # async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: - # result = await crawler.arun( - # url="https://www.nbcnews.com/business", - # bypass_cache=True - # ) - # print(result.markdown[:500]) # Print first 500 characters + async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + bypass_cache=True + ) + print(result.markdown[:500]) # Print first 500 characters async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: diff --git a/requirements.txt b/requirements.txt index 74e8b3d6..e6294cc5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ beautifulsoup4~=4.12 tf-playwright-stealth~=1.0 xxhash~=3.4 rank-bm25~=0.2 -aiofiles~=24.0 \ No newline at end of file +aiofiles~=24.0 +colorama~=0.4 \ No newline at end of file diff --git a/tests/docker_example.py b/tests/docker_example.py new file mode 100644 index 00000000..658e80fd --- /dev/null +++ b/tests/docker_example.py @@ -0,0 +1,332 @@ +import requests +import json +import time +import sys +import base64 +import os +from typing import Dict, Any + +class Crawl4AiTester: + def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None): + self.base_url = base_url + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback + self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} + + def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: + # Submit crawl job + response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) + if response.status_code == 403: + raise Exception("API token is invalid or missing") + task_id = response.json()["task_id"] + print(f"Task ID: {task_id}") + 
+ # Poll for result + start_time = time.time() + while True: + if time.time() - start_time > timeout: + raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds") + + result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers) + status = result.json() + + if status["status"] == "failed": + print("Task failed:", status.get("error")) + raise Exception(f"Task failed: {status.get('error')}") + + if status["status"] == "completed": + return status + + time.sleep(2) + + def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60) + if response.status_code == 408: + raise TimeoutError("Task did not complete within server timeout") + response.raise_for_status() + return response.json() + +def test_docker_deployment(version="basic"): + tester = Crawl4AiTester( + # base_url="http://localhost:11235" , + base_url="https://crawl4ai-sby74.ondigitalocean.app", + api_token="test" + ) + print(f"Testing Crawl4AI Docker {version} version") + + # Health check with timeout and retry + max_retries = 5 + for i in range(max_retries): + try: + health = requests.get(f"{tester.base_url}/health", timeout=10) + print("Health check:", health.json()) + break + except requests.exceptions.RequestException as e: + if i == max_retries - 1: + print(f"Failed to connect after {max_retries} attempts") + sys.exit(1) + print(f"Waiting for service to start (attempt {i+1}/{max_retries})...") + time.sleep(5) + + # Test cases based on version + test_basic_crawl(tester) + test_basic_crawl(tester) + test_basic_crawl_sync(tester) + + # if version in ["full", "transformer"]: + # test_cosine_extraction(tester) + + # test_js_execution(tester) + # test_css_selector(tester) + # test_structured_extraction(tester) + # test_llm_extraction(tester) + # test_llm_with_ollama(tester) + # test_screenshot(tester) + + +def test_basic_crawl(tester: Crawl4AiTester): + 
print("\n=== Testing Basic Crawl ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + "session_id": "test" + } + + result = tester.submit_and_wait(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + assert len(result["result"]["markdown"]) > 0 + +def test_basic_crawl_sync(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Sync) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + "session_id": "test" + } + + result = tester.submit_sync(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['status'] == 'completed' + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + +def test_js_execution(tester: Crawl4AiTester): + print("\n=== Testing JS Execution ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "js_code": [ + "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" + ], + "wait_for": "article.tease-card:nth-child(10)", + "crawler_params": { + "headless": True + } + } + + result = tester.submit_and_wait(request) + print(f"JS execution result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + +def test_css_selector(tester: Crawl4AiTester): + print("\n=== Testing CSS Selector ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 7, + "css_selector": ".wide-tease-item__description", + "crawler_params": { + "headless": True + }, + "extra": {"word_count_threshold": 10} + + } + + result = tester.submit_and_wait(request) + print(f"CSS selector result length: {len(result['result']['markdown'])}") + assert result["result"]["success"] + +def test_structured_extraction(tester: Crawl4AiTester): + print("\n=== Testing Structured Extraction ===") + 
schema = { + "name": "Coinbase Crypto Prices", + "baseSelector": ".cds-tableRow-t45thuk", + "fields": [ + { + "name": "crypto", + "selector": "td:nth-child(1) h2", + "type": "text", + }, + { + "name": "symbol", + "selector": "td:nth-child(1) p", + "type": "text", + }, + { + "name": "price", + "selector": "td:nth-child(2)", + "type": "text", + } + ], + } + + request = { + "urls": "https://www.coinbase.com/explore", + "priority": 9, + "extraction_config": { + "type": "json_css", + "params": { + "schema": schema + } + } + } + + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} items") + print("Sample item:", json.dumps(extracted[0], indent=2)) + assert result["result"]["success"] + assert len(extracted) > 0 + +def test_llm_extraction(tester: Crawl4AiTester): + print("\n=== Testing LLM Extraction ===") + schema = { + "type": "object", + "properties": { + "model_name": { + "type": "string", + "description": "Name of the OpenAI model." + }, + "input_fee": { + "type": "string", + "description": "Fee for input token for the OpenAI model." + }, + "output_fee": { + "type": "string", + "description": "Fee for output token for the OpenAI model." 
+ } + }, + "required": ["model_name", "input_fee", "output_fee"] + } + + request = { + "urls": "https://openai.com/api/pricing", + "priority": 8, + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": os.getenv("OPENAI_API_KEY"), + "schema": schema, + "extraction_type": "schema", + "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""" + } + }, + "crawler_params": {"word_count_threshold": 1} + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} model pricing entries") + print("Sample entry:", json.dumps(extracted[0], indent=2)) + assert result["result"]["success"] + except Exception as e: + print(f"LLM extraction test failed (might be due to missing API key): {str(e)}") + +def test_llm_with_ollama(tester: Crawl4AiTester): + print("\n=== Testing LLM with Ollama ===") + schema = { + "type": "object", + "properties": { + "article_title": { + "type": "string", + "description": "The main title of the news article" + }, + "summary": { + "type": "string", + "description": "A brief summary of the article content" + }, + "main_topics": { + "type": "array", + "items": {"type": "string"}, + "description": "Main topics or themes discussed in the article" + } + } + } + + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "extraction_config": { + "type": "llm", + "params": { + "provider": "ollama/llama2", + "schema": schema, + "extraction_type": "schema", + "instruction": "Extract the main article information including title, summary, and main topics." 
+ } + }, + "extra": {"word_count_threshold": 1}, + "crawler_params": {"verbose": True} + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print("Extracted content:", json.dumps(extracted, indent=2)) + assert result["result"]["success"] + except Exception as e: + print(f"Ollama extraction test failed: {str(e)}") + +def test_cosine_extraction(tester: Crawl4AiTester): + print("\n=== Testing Cosine Extraction ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 8, + "extraction_config": { + "type": "cosine", + "params": { + "semantic_filter": "business finance economy", + "word_count_threshold": 10, + "max_dist": 0.2, + "top_k": 3 + } + } + } + + try: + result = tester.submit_and_wait(request) + extracted = json.loads(result["result"]["extracted_content"]) + print(f"Extracted {len(extracted)} text clusters") + print("First cluster tags:", extracted[0]["tags"]) + assert result["result"]["success"] + except Exception as e: + print(f"Cosine extraction test failed: {str(e)}") + +def test_screenshot(tester: Crawl4AiTester): + print("\n=== Testing Screenshot ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 5, + "screenshot": True, + "crawler_params": { + "headless": True + } + } + + result = tester.submit_and_wait(request) + print("Screenshot captured:", bool(result["result"]["screenshot"])) + + if result["result"]["screenshot"]: + # Save screenshot + screenshot_data = base64.b64decode(result["result"]["screenshot"]) + with open("test_screenshot.jpg", "wb") as f: + f.write(screenshot_data) + print("Screenshot saved as test_screenshot.jpg") + + assert result["result"]["success"] + +if __name__ == "__main__": + version = sys.argv[1] if len(sys.argv) > 1 else "basic" + # version = "full" + test_docker_deployment(version) \ No newline at end of file From 3a524a3bdd3afdd58d64c336031e7687fdfe5631 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 
2024 16:00:39 +0800 Subject: [PATCH 26/50] fix(docs): remove unnecessary blank line in README for improved readability --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index a2806304..069c02b8 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,6 @@ Deploy your own instance of Crawl4AI with one click: [![DigitalOcean Referral Badge](https://web-platforms.sfo2.cdn.digitaloceanspaces.com/WWW/Badge%203.svg)](https://www.digitalocean.com/?repo=https://github.com/unclecode/crawl4ai/tree/0.3.74&refcode=a0780f1bdb3d&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge) - > 💡 **Recommended specs**: 4GB RAM minimum. Select "professional-xs" or higher when deploying for stable operation. The deploy will: From 2a82455b3dd3427f3099e201c2d88fadcc0c78fc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 17:17:34 +0800 Subject: [PATCH 27/50] feat(crawl): implement direct crawl functionality and introduce CacheMode for improved caching control --- docs/examples/docker_example.py | 33 ++++++++++++-- docs/md_v2/basic/cache-modes.md | 79 +++++++++++++++++++++++++++++++++ main.py | 46 ++++++++++++++++++- mkdocs.yml | 1 + 4 files changed, 153 insertions(+), 6 deletions(-) create mode 100644 docs/md_v2/basic/cache-modes.md diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 898f14da..17ef9f04 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -9,7 +9,7 @@ from typing import Dict, Any class Crawl4AiTester: def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None): self.base_url = base_url - self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} def submit_and_wait(self, 
request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: @@ -44,6 +44,16 @@ class Crawl4AiTester: raise TimeoutError("Task did not complete within server timeout") response.raise_for_status() return response.json() + + def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + """Directly crawl without using task queue""" + response = requests.post( + f"{self.base_url}/crawl_direct", + json=request_data, + headers=self.headers + ) + response.raise_for_status() + return response.json() def test_docker_deployment(version="basic"): tester = Crawl4AiTester( @@ -68,9 +78,10 @@ def test_docker_deployment(version="basic"): time.sleep(5) # Test cases based on version - test_basic_crawl(tester) - test_basic_crawl(tester) - test_basic_crawl_sync(tester) + # test_basic_crawl(tester) + # test_basic_crawl(tester) + # test_basic_crawl_sync(tester) + test_basic_crawl_direct(tester) # if version in ["full", "transformer"]: # test_cosine_extraction(tester) @@ -110,6 +121,20 @@ def test_basic_crawl_sync(tester: Crawl4AiTester): assert result['result']['success'] assert len(result['result']['markdown']) > 0 +def test_basic_crawl_direct(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Direct) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + # "session_id": "test" + "cache_mode": "bypass" # or "enabled", "disabled", "read_only", "write_only" + } + + result = tester.crawl_direct(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/basic/cache-modes.md new file mode 100644 index 00000000..04a4f218 --- /dev/null +++ b/docs/md_v2/basic/cache-modes.md @@ -0,0 +1,79 @@ +# Crawl4AI Cache System and Migration Guide + +## Overview +Starting from 
version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. + +## Old vs New Approach + +### Old Way (Deprecated) +The old system used multiple boolean flags: +- `bypass_cache`: Skip cache entirely +- `disable_cache`: Disable all caching +- `no_cache_read`: Don't read from cache +- `no_cache_write`: Don't write to cache + +### New Way (Recommended) +The new system uses a single `CacheMode` enum: +- `CacheMode.ENABLED`: Normal caching (read/write) +- `CacheMode.DISABLED`: No caching at all +- `CacheMode.READ_ONLY`: Only read from cache +- `CacheMode.WRITE_ONLY`: Only write to cache +- `CacheMode.BYPASS`: Skip cache for this operation + +## Migration Example + +### Old Code (Deprecated) +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + bypass_cache=True # Old way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### New Code (Recommended) +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode # Import CacheMode + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + cache_mode=CacheMode.BYPASS # New way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Common Migration Patterns + +Old Flag | New Mode +---------|---------- +`bypass_cache=True` | `cache_mode=CacheMode.BYPASS` +`disable_cache=True` | `cache_mode=CacheMode.DISABLED` +`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` +`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` + +## Suppressing 
Deprecation Warnings +If you need time to migrate, you can temporarily suppress deprecation warnings: +```python +# In your config.py +SHOW_DEPRECATION_WARNINGS = False +``` diff --git a/main.py b/main.py index 41788d61..ee5f7fc6 100644 --- a/main.py +++ b/main.py @@ -25,7 +25,7 @@ import logging from enum import Enum from dataclasses import dataclass import json -from crawl4ai import AsyncWebCrawler, CrawlResult +from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, @@ -66,6 +66,7 @@ class CrawlRequest(BaseModel): magic: bool = False extra: Optional[Dict[str, Any]] = {} session_id: Optional[str] = None + cache_mode: Optional[CacheMode] = None @dataclass class TaskInfo: @@ -329,7 +330,7 @@ app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages" # API token security security = HTTPBearer() -CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") +CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)): if not CRAWL4AI_API_TOKEN: @@ -419,6 +420,47 @@ async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: # If we get here, task didn't complete within timeout raise HTTPException(status_code=408, detail="Task timed out") +@app.post("/crawl_direct", dependencies=[Depends(verify_token)]) +async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]: + try: + crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params) + extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config) + + try: + if isinstance(request.urls, list): + results = await crawler.arun_many( + urls=[str(url) for url in request.urls], + extraction_strategy=extraction_strategy, + js_code=request.js_code, + wait_for=request.wait_for, + css_selector=request.css_selector, + screenshot=request.screenshot, + magic=request.magic, + 
cache_mode=request.cache_mode, + session_id=request.session_id, + **request.extra, + ) + return {"results": [result.dict() for result in results]} + else: + result = await crawler.arun( + url=str(request.urls), + extraction_strategy=extraction_strategy, + js_code=request.js_code, + wait_for=request.wait_for, + css_selector=request.css_selector, + screenshot=request.screenshot, + magic=request.magic, + cache_mode=request.cache_mode, + session_id=request.session_id, + **request.extra, + ) + return {"result": result.dict()} + finally: + await crawler_service.crawler_pool.release(crawler) + except Exception as e: + logger.error(f"Error in direct crawl: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + @app.get("/health") async def health_check(): available_slots = await crawler_service.resource_monitor.get_available_slots() diff --git a/mkdocs.yml b/mkdocs.yml index b09cb9eb..1b26b9df 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,6 +17,7 @@ nav: - 'Browser Configuration': 'basic/browser-config.md' - 'Page Interaction': 'basic/page-interaction.md' - 'Content Selection': 'basic/content-selection.md' + - 'Cache Modes': 'basic/cache-modes.md' - Advanced: - 'Content Processing': 'advanced/content-processing.md' From f9fe6f89feafeba175dc35da64ca5f6883839473 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 18:09:33 +0800 Subject: [PATCH 28/50] feat(database): implement version management and migration checks during initialization --- crawl4ai/async_database.py | 39 +++++++++++++++++++++++++-- crawl4ai/version_manager.py | 30 +++++++++++++++++++++ docs/md_v2/basic/installation.md | 45 ++++++++++++++++++++++++++++++++ setup.py | 2 +- 4 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 crawl4ai/version_manager.py diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index f97d8131..7809dfe1 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -11,6 +11,7 @@ from .models import CrawlResult 
import xxhash import aiofiles from .config import NEED_MIGRATION +from .version_manager import VersionManager # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -28,22 +29,49 @@ class AsyncDatabaseManager: self.connection_pool: Dict[int, aiosqlite.Connection] = {} self.pool_lock = asyncio.Lock() self.connection_semaphore = asyncio.Semaphore(pool_size) + self._initialized = False + self.version_manager = VersionManager() + async def initialize(self): """Initialize the database and connection pool""" try: logger.info("Initializing database...") + # Ensure the database file exists + os.makedirs(os.path.dirname(self.db_path), exist_ok=True) + + # Check if version update is needed + needs_update = self.version_manager.needs_update() + + # Always ensure base table exists await self.ainit_db() - if NEED_MIGRATION: + + # Verify the table exists + async def verify_table(db): + async with db.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'" + ) as cursor: + result = await cursor.fetchone() + if not result: + raise Exception("crawled_data table was not created") + + await self.execute_with_retry(verify_table) + + # If version changed or fresh install, run updates + if needs_update: + logger.info("New version detected, running updates...") await self.update_db_schema() from .migrations import run_migration # Import here to avoid circular imports await run_migration() - logger.info("Database initialization and migration completed successfully") + self.version_manager.update_version() # Update stored version after successful migration + logger.info("Version update completed successfully") else: logger.info("Database initialization completed successfully") + except Exception as e: logger.error(f"Database initialization error: {e}") logger.info("Database will be initialized on first use") + raise async def cleanup(self): """Cleanup connections when shutting down""" @@ -55,6 +83,12 @@ class 
AsyncDatabaseManager: @asynccontextmanager async def get_connection(self): """Connection pool manager""" + if not self._initialized: + async with self.pool_lock: # Prevent multiple simultaneous initializations + if not self._initialized: # Double-check after acquiring lock + await self.initialize() + self._initialized = True + async with self.connection_semaphore: task_id = id(asyncio.current_task()) try: @@ -79,6 +113,7 @@ class AsyncDatabaseManager: await self.connection_pool[task_id].close() del self.connection_pool[task_id] + async def execute_with_retry(self, operation, *args): """Execute database operations with retry logic""" for attempt in range(self.max_retries): diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py new file mode 100644 index 00000000..07e0c0e9 --- /dev/null +++ b/crawl4ai/version_manager.py @@ -0,0 +1,30 @@ +# version_manager.py +import os +from pathlib import Path +from packaging import version +from . import __version__ + +class VersionManager: + def __init__(self): + self.home_dir = Path.home() / ".crawl4ai" + self.version_file = self.home_dir / "version.txt" + + def get_installed_version(self): + """Get the version recorded in home directory""" + if not self.version_file.exists(): + return None + try: + return version.parse(self.version_file.read_text().strip()) + except: + return None + + def update_version(self): + """Update the version file to current library version""" + self.version_file.write_text(__version__) + + def needs_update(self): + """Check if database needs update based on version""" + installed = self.get_installed_version() + current = version.parse(__version__) + return installed is None or installed < current + diff --git a/docs/md_v2/basic/installation.md b/docs/md_v2/basic/installation.md index a4a60857..de8aeafa 100644 --- a/docs/md_v2/basic/installation.md +++ b/docs/md_v2/basic/installation.md @@ -58,6 +58,51 @@ crawl4ai-download-models This is optional but will boost the performance and speed 
of the crawler. You only need to do this once after installation. +## Playwright Installation Note for Ubuntu + +If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies: + +```bash +sudo apt-get install -y \ + libwoff1 \ + libopus0 \ + libwebp7 \ + libwebpdemux2 \ + libenchant-2-2 \ + libgudev-1.0-0 \ + libsecret-1-0 \ + libhyphen0 \ + libgdk-pixbuf2.0-0 \ + libegl1 \ + libnotify4 \ + libxslt1.1 \ + libevent-2.1-7 \ + libgles2 \ + libxcomposite1 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libepoxy0 \ + libgtk-3-0 \ + libharfbuzz-icu0 \ + libgstreamer-gl1.0-0 \ + libgstreamer-plugins-bad1.0-0 \ + gstreamer1.0-plugins-good \ + gstreamer1.0-plugins-bad \ + libxt6 \ + libxaw7 \ + xvfb \ + fonts-noto-color-emoji \ + libfontconfig \ + libfreetype6 \ + xfonts-cyrillic \ + xfonts-scalable \ + fonts-liberation \ + fonts-ipafont-gothic \ + fonts-wqy-zenhei \ + fonts-tlwg-loma-otf \ + fonts-freefont-ttf +``` + ## Option 2: Using Docker (Coming Soon) Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems. 
diff --git a/setup.py b/setup.py index d8ad2cd3..bbc03026 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ class PostInstallCommand(install): def run(self): install.run(self) install_playwright() - run_migration() + # run_migration() setup( name="Crawl4AI", From a59c107b237ccdab1036f08123421f2645a628f3 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 18:42:43 +0800 Subject: [PATCH 29/50] Update changelog for 0.3.74 --- CHANGELOG.md | 220 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 201 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e82fa6a2..8e5cc91a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,194 @@ # Changelog +## [0.3.74] November 17, 2024 -## Version 0.3.74, Major Changes +This changelog details the updates and changes introduced in Crawl4AI version 0.3.74. It's designed to inform developers about new features, modifications to existing components, removals, and other important information. + +### 1. File Download Processing + +- Users can now specify download folders using the `downloads_path` parameter in the `AsyncWebCrawler` constructor or the `arun` method. If not specified, downloads are saved to a "downloads" folder within the `.crawl4ai` directory. +- File download tracking is integrated into the `CrawlResult` object. Successfully downloaded files are listed in the `downloaded_files` attribute, providing their paths. +- Added `accept_downloads` parameter to the crawler strategies (defaults to `False`). If set to True you can add JS code and `wait_for` parameter for file download. 
+ +**Example:** + +```python +import asyncio +import os +from pathlib import Path +from crawl4ai import AsyncWebCrawler + +async def download_example(): + downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads") + os.makedirs(downloads_path, exist_ok=True) + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=downloads_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { downloadLink.click(); } + """, + wait_for=5 # To ensure download has started + ) + + if result.downloaded_files: + print("Downloaded files:") + for file in result.downloaded_files: + print(f"- {file}") + +asyncio.run(download_example()) + +``` + +### 2. Refined Content Filtering + +- Introduced the `RelevanceContentFilter` strategy (and its implementation `BM25ContentFilter`) for extracting relevant content from web pages, replacing Fit Markdown and other content cleaning strategy. This new strategy leverages the BM25 algorithm to identify chunks of text relevant to the page's title, description, keywords, or a user-provided query. +- The `fit_markdown` flag in the content scraper is used to filter content based on title, meta description, and keywords. + +**Example:** + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) + print(result.extracted_content) # Or result.fit_markdown for the markdown version + print(result.fit_html) # Or result.fit_html to show HTML with only the filtered content + +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) +``` + +### 3. 
Raw HTML and Local File Support + +- Added support for crawling local files and raw HTML content directly. +- Use the `file://` prefix for local file paths. +- Use the `raw:` prefix for raw HTML strings. + +**Example:** + +```python +async def crawl_local_or_raw(crawler, content, content_type): + prefix = "file://" if content_type == "local" else "raw:" + url = f"{prefix}{content}" + result = await crawler.arun(url=url) + if result.success: + print(f"Markdown Content from {content_type.title()} Source:") + print(result.markdown) + +# Example usage with local file and raw HTML +async def main(): + async with AsyncWebCrawler() as crawler: + # Local File + await crawl_local_or_raw( + crawler, os.path.abspath('tests/async/sample_wikipedia.html'), "local" + ) + # Raw HTML + await crawl_raw_html(crawler, "

<h1>Raw Test</h1><p>This is raw HTML.</p>

    ") + + +asyncio.run(main()) +``` + +### 4. Browser Management + +- New asynchronous crawler strategy implemented using Playwright. +- `ManagedBrowser` class introduced for improved browser session handling, offering features like persistent browser sessions between requests (using `session_id` parameter) and browser process monitoring. +- Updated to tf-playwright-stealth for enhanced stealth capabilities. +- Added `use_managed_browser`, `use_persistent_context`, and `chrome_channel` parameters to AsyncPlaywrightCrawlerStrategy. + + +**Example:** +```python +async def browser_management_demo(): + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "user-data-dir") + os.makedirs(user_data_dir, exist_ok=True) # Ensure directory exists + async with AsyncWebCrawler( + use_managed_browser=True, + user_data_dir=user_data_dir, + use_persistent_context=True, + verbose=True + ) as crawler: + result1 = await crawler.arun( + url="https://example.com", session_id="my_session" + ) + result2 = await crawler.arun( + url="https://example.com/anotherpage", session_id="my_session" + ) + +asyncio.run(browser_management_demo()) +``` + + +### 5. API Server & Cache Improvements + +- Added CORS support to API server. +- Implemented static file serving. +- Enhanced root redirect functionality. +- Cache database updated to store response headers and downloaded files information. It utilizes a file system approach to manage large content efficiently. +- New, more efficient caching database built using xxhash and file system approach. +- Introduced `CacheMode` enum (`ENABLED`, `DISABLED`, `READ_ONLY`, `WRITE_ONLY`, `BYPASS`) and `always_bypass_cache` parameter in AsyncWebCrawler for fine-grained cache control. This replaces `bypass_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. + + +### 🗑️ Removals + +- Removed deprecated: `crawl4ai/content_cleaning_strategy.py`. 
+- Removed internal class ContentCleaningStrategy +- Removed legacy cache control flags: `bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. These have been superseded by `cache_mode`. + + +### ⚙️ Other Changes + +- Moved version file to `crawl4ai/__version__.py`. +- Added `crawl4ai/cache_context.py`. +- Added `crawl4ai/version_manager.py`. +- Added `crawl4ai/migrations.py`. +- Added `crawl4ai-migrate` entry point. +- Added config `NEED_MIGRATION` and `SHOW_DEPRECATION_WARNINGS`. +- API server now requires an API token for authentication, configurable with the `CRAWL4AI_API_TOKEN` environment variable. This enhances API security. +- Added synchronous crawl endpoint `/crawl_sync` for immediate result retrieval, and direct crawl endpoint `/crawl_direct` bypassing the task queue. + + +### ⚠️ Deprecation Notices + +- The synchronous version of `WebCrawler` is being phased out. While still available via `crawl4ai[sync]`, it will eventually be removed. Transition to `AsyncWebCrawler` is strongly recommended. Boolean cache control flags in `arun` are also deprecated, migrate to using the `cache_mode` parameter. See examples in the "New Features" section above for correct usage. + + +### 🐛 Bug Fixes + +- Resolved issue with browser context closing unexpectedly in Docker. This significantly improves stability, particularly within containerized environments. +- Fixed memory leaks associated with incorrect asynchronous cleanup by removing the `__del__` method and ensuring the browser context is closed explicitly using context managers. +- Improved error handling in `WebScrapingStrategy`. More detailed error messages and suggestions for debugging will minimize frustration when running into unexpected issues. +- Fixed issue with incorrect text parsing in specific HTML structures. 
+ + +### Example of migrating to the new CacheMode: + +**Old way:** + +```python +crawler = AsyncWebCrawler(always_by_pass_cache=True) +result = await crawler.arun(url="https://example.com", bypass_cache=True) +``` + +**New way:** + +```python +from crawl4ai import CacheMode + +crawler = AsyncWebCrawler(always_bypass_cache=True) +result = await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS) +``` + + +## [0.3.74] - November 13, 2024 1. **File Download Processing** (Nov 14, 2024) - Added capability for users to specify download folders @@ -30,14 +217,9 @@ - Implemented static file serving - Enhanced root redirect functionality -# [0.3.74] November 14, 2024 - -- In this commit, the library is updated to process file downloads. Users can now specify a download folder and trigger the download process via JavaScript or other means, with all files being saved. The list of downloaded files will also be added to the crowd result object. -- Another thing this commit introduces is the concept of the Relevance Content Filter. This is an improvement over Fit Markdown. This class of strategies aims to extract the main content from a given page - the part that really matters and is useful to be processed. One strategy has been created using the BM25 algorithm, which finds chunks of text from the web page relevant to its title, descriptions, and keywords, or supports a given user query and matches them. The result is then returned to the main engine to be converted to Markdown. Plans include adding approaches using language models as well. -- The cache database was updated to hold information about response headers and downloaded files. 
-# Changelog - November 13, 2024 +## [0.3.731] - November 13, 2024 ### Added - Support for raw HTML and local file crawling via URL prefixes ('raw:', 'file://') @@ -137,7 +319,7 @@ - Modified database connection management approach - Updated API response structure for better consistency -## Migration Guide +### Migration Guide When upgrading to v0.3.73, be aware of the following changes: 1. Docker Deployment: @@ -159,7 +341,7 @@ When upgrading to v0.3.73, be aware of the following changes: - Follow recommended fixes for any identified problems -## [2024-11-04 - 13:21:42] Comprehensive Update of Crawl4AI Features and Dependencies +## [v0.3.73] - 2024-11-04 This commit introduces several key enhancements, including improved error handling and robust database operations in `async_database.py`, which now features a connection pool and retry logic for better reliability. Updates to the README.md provide clearer instructions and a better user experience with links to documentation sections. The `.gitignore` file has been refined to include additional directories, while the async web crawler now utilizes a managed browser for more efficient crawling. Furthermore, multiple dependency updates and introduction of the `CustomHTML2Text` class enhance text extraction capabilities. ## [v0.3.73] - 2024-10-24 @@ -405,43 +587,43 @@ These updates aim to provide more flexibility in text processing, improve perfor - Allows retrieval of content after a specified delay, useful for dynamically loaded content. - **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. -## Improvements and Optimizations +### Improvements and Optimizations -### 1. AsyncWebCrawler Enhancements +#### 1. AsyncWebCrawler Enhancements - **Flexible Initialization**: Now accepts arbitrary keyword arguments, passed directly to the crawler strategy. - Allows for more customized setups. -### 2. Image Processing Optimization +#### 2. 
Image Processing Optimization - Enhanced image handling in WebScrapingStrategy. - Added filtering for small, invisible, or irrelevant images. - Improved image scoring system for better content relevance. - Implemented JavaScript-based image dimension updating for more accurate representation. -### 3. Database Schema Auto-updates +#### 3. Database Schema Auto-updates - Automatic database schema updates ensure compatibility with the latest version. -### 4. Enhanced Error Handling and Logging +#### 4. Enhanced Error Handling and Logging - Improved error messages and logging for easier debugging. -### 5. Content Extraction Refinements +#### 5. Content Extraction Refinements - Refined HTML sanitization process. - Improved handling of base64 encoded images. - Enhanced Markdown conversion process. - Optimized content extraction algorithms. -### 6. Utility Function Enhancements +#### 6. Utility Function Enhancements - `perform_completion_with_backoff` function now supports additional arguments for more customized API calls to LLM providers. -## Bug Fixes +### Bug Fixes - Fixed an issue where image tags were being prematurely removed during content extraction. -## Examples and Documentation +### Examples and Documentation - Updated `quickstart_async.py` with examples of: - Using custom headers in LLM extraction. - Different LLM provider usage (OpenAI, Hugging Face, Ollama). - Custom browser type usage. -## Developer Notes +### Developer Notes - Refactored code for better maintainability, flexibility, and performance. - Enhanced type hinting throughout the codebase for improved development experience. - Expanded error handling for more robust operation. 
From df63a4060673b2d5647abdce07810e29cf20e739 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 19:44:45 +0800 Subject: [PATCH 30/50] feat(docs): update examples and documentation to replace bypass_cache with cache_mode for improved clarity --- README.md | 24 ++- crawl4ai/async_webcrawler.py | 37 +++-- crawl4ai/content_scrapping_strategy.py | 15 +- docs/examples/v0.3.74.overview.py | 2 +- docs/md_v2/advanced/managed_browser.md | 84 ++++++++++ .../advanced/session-management-advanced.md | 10 +- docs/md_v2/advanced/session-management.md | 2 +- docs/md_v2/api/arun.md | 42 +++-- docs/md_v2/api/crawl-result.md | 1 + docs/md_v2/api/parameters.md | 3 +- docs/md_v2/basic/content_filtering.md | 84 ++++++++++ docs/md_v2/basic/file-download.md | 148 ++++++++++++++++++ docs/md_v2/basic/quickstart.md | 22 +-- docs/md_v2/basic/simple-crawling.md | 10 +- .../episode_11_2_Extraction_Strategies_LLM.md | 4 +- ...isode_11_3_Extraction_Strategies_Cosine.md | 4 +- docs/md_v2/tutorial/tutorial.md | 10 +- 17 files changed, 422 insertions(+), 80 deletions(-) create mode 100644 docs/md_v2/advanced/managed_browser.md create mode 100644 docs/md_v2/basic/content_filtering.md create mode 100644 docs/md_v2/basic/file-download.md diff --git a/README.md b/README.md index 069c02b8..9c3796cd 100644 --- a/README.md +++ b/README.md @@ -11,21 +11,19 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -## 🌟 Meet the Crawl4AI Assistant: Your Copilot for Crawling +## New in 0.3.74 ✨ -Use the [Crawl4AI GPT Assistant](https://tinyurl.com/crawl4ai-gpt) as your AI-powered copilot! With this assistant, you can: +- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)! +- 📥 **Download Mastery:** Control downloads, specify folders, and track files within the `CrawlResult` object. 
+- 🔎 **Relevance Filtering:** Extract the most important content with the new `RelevanceContentFilter` and BM25 algorithm. Control filtering with the `fit_markdown` flag. +- 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. +- 🤖 **Browser Boss:** Manage browser sessions with persistent contexts, process monitoring, and tf-playwright-stealth integration. Configure using `use_managed_browser`, `user_data_dir`, and `use_persistent_context` parameters. +- ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter. +- 🔒 **API Security:** Protect your API server with token-based authentication using the `CRAWL4AI_API_TOKEN` environment variable. +- 🔄 **Synchronous & Direct Crawling:** Get immediate results with `/crawl_sync` or bypass the task queue with `/crawl_direct`. +- 🛠️ **Database Migration:** A new `crawl4ai-migrate` command ensures smooth upgrades and data integrity between versions. +- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing. -- 🧑‍💻 Generate code for complex crawling and extraction tasks -- 💡 Get tailored support and examples -- 📘 Learn Crawl4AI faster with step-by-step guidance - -## New in 0.3.73 ✨ - -- 🐳 Docker Ready: Full API server with seamless deployment & scaling -- 🎯 Browser Takeover: Use your own browser with cookies & history intact (CDP support) -- 📝 Mockdown+: Enhanced tag preservation & content extraction -- ⚡️ Parallel Power: Supercharged multi-URL crawling performance -- 🌟 And many more exciting updates... ## Try it Now! 
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index d554576d..d22e3b1f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -10,14 +10,14 @@ import asyncio from .models import CrawlResult from .async_database import async_db_manager from .chunking_strategy import * +from .content_filter_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy from .config import ( MIN_WORD_THRESHOLD, - IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, - SHOW_DEPRECATION_WARNINGS # New import + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ) from .utils import ( sanitize_input_encode, @@ -49,7 +49,7 @@ class AsyncWebCrawler: ) To disable deprecation warnings: - Set SHOW_DEPRECATION_WARNINGS = False in config.py + Pass warning=False to suppress the warning. """ def __init__( @@ -85,11 +85,11 @@ class AsyncWebCrawler: # Handle deprecated parameter if always_by_pass_cache is not None: - if SHOW_DEPRECATION_WARNINGS: + if kwargs.get("warning", True): warnings.warn( "'always_by_pass_cache' is deprecated and will be removed in version X.X.X. " "Use 'always_bypass_cache' instead. 
" - "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + "Pass warning=False to suppress this warning.", DeprecationWarning, stacklevel=2 ) @@ -126,6 +126,7 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, cache_mode: Optional[CacheMode] = None, # Deprecated parameters bypass_cache: bool = False, @@ -172,7 +173,7 @@ class AsyncWebCrawler: try: # Handle deprecated parameters if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): - if SHOW_DEPRECATION_WARNINGS: + if kwargs.get("warning", True): warnings.warn( "Cache control boolean flags are deprecated and will be removed in version X.X.X. " "Use 'cache_mode' parameter instead. Examples:\n" @@ -180,7 +181,7 @@ class AsyncWebCrawler: "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" - "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", + "Pass warning=False to suppress this warning.", DeprecationWarning, stacklevel=2 ) @@ -257,6 +258,7 @@ class AsyncWebCrawler: word_count_threshold=word_count_threshold, extraction_strategy=extraction_strategy, chunking_strategy=chunking_strategy, + content_filter=content_filter, css_selector=css_selector, screenshot=screenshot_data, verbose=verbose, @@ -308,6 +310,7 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, cache_mode: Optional[CacheMode] = None, # Deprecated parameters bypass_cache: bool = False, @@ -335,14 +338,15 @@ class AsyncWebCrawler: Returns: List[CrawlResult]: Results for each URL """ - if bypass_cache and 
SHOW_DEPRECATION_WARNINGS: - warnings.warn( - "'bypass_cache' is deprecated and will be removed in version X.X.X. " - "Use 'cache_mode=CacheMode.BYPASS' instead. " - "Set SHOW_DEPRECATION_WARNINGS=False in config.py to suppress this warning.", - DeprecationWarning, - stacklevel=2 - ) + if bypass_cache: + if kwargs.get("warning", True): + warnings.warn( + "'bypass_cache' is deprecated and will be removed in version X.X.X. " + "Use 'cache_mode=CacheMode.BYPASS' instead. " + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) if cache_mode is None: cache_mode = CacheMode.BYPASS @@ -356,6 +360,7 @@ class AsyncWebCrawler: word_count_threshold=word_count_threshold, extraction_strategy=extraction_strategy, chunking_strategy=chunking_strategy, + content_filter=content_filter, cache_mode=cache_mode, css_selector=css_selector, screenshot=screenshot, @@ -377,6 +382,7 @@ class AsyncWebCrawler: word_count_threshold: int, extraction_strategy: ExtractionStrategy, chunking_strategy: ChunkingStrategy, + content_filter: RelevantContentFilter, css_selector: str, screenshot: str, verbose: bool, @@ -397,6 +403,7 @@ class AsyncWebCrawler: image_description_min_word_threshold=kwargs.get( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ), + content_filter = content_filter, **kwargs, ) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index 9c81638c..d16b0680 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -532,14 +532,13 @@ class WebScrapingStrategy(ContentScrapingStrategy): fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." 
- if kwargs.get('fit_markdown', False): - # cleaner = ContentCleaningStrategy() - # fit_html = cleaner.clean(cleaned_html) - # fit_markdown = h.handle(fit_html) - content_filter = BM25ContentFilter( - user_query= kwargs.get('fit_markdown_user_query', None), - bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) + if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): + content_filter = kwargs.get('content_filter', None) + if not content_filter: + content_filter = BM25ContentFilter( + user_query= kwargs.get('fit_markdown_user_query', None), + bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) fit_html = content_filter.filter_content(html) fit_html = '\n'.join('
<div>{}</div>
    '.format(s) for s in fit_html) fit_markdown = h.handle(fit_html) diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py index 579d05dd..ec3a7d73 100644 --- a/docs/examples/v0.3.74.overview.py +++ b/docs/examples/v0.3.74.overview.py @@ -56,7 +56,7 @@ async def content_filtering_example(): result = await crawler.arun( url="https://openai.com/blog", - extraction_strategy=content_filter + content_filter=content_filter ) print(f"Filtered content: {result.extracted_content}") diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md new file mode 100644 index 00000000..80d6fc1a --- /dev/null +++ b/docs/md_v2/advanced/managed_browser.md @@ -0,0 +1,84 @@ +# Content Filtering in Crawl4AI + +This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies. + +## Relevance Content Filter + +The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. + +## BM25 Algorithm + +The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query. + +### Usage + +To use the `BM25ContentFilter`, initialize it and then pass it as the `extraction_strategy` parameter to the `arun` method of the crawler. 
+ +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query=None): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering + if result.success: + print(f"Filtered Content (JSON):\n{result.extracted_content}") + print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object + print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing. + else: + print("Error:", result.error_message) + +# Example usage: +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query. + +``` + +### Parameters + +- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query. +- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering. + + +## Fit Markdown Flag + +Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. 
Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`. + + +## Custom Content Filtering Strategies + +You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs. + +```python +from crawl4ai.content_filter_strategy import RelevantContentFilter +from bs4 import BeautifulSoup, Tag +from typing import List + +class MyCustomFilter(RelevantContentFilter): + def filter_content(self, html: str) -> List[str]: + soup = BeautifulSoup(html, 'lxml') + # Implement custom filtering logic here + # Example: extract all paragraphs within divs with class "article-body" + filtered_paragraphs = [] + for tag in soup.select("div.article-body p"): + if isinstance(tag, Tag): + filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element. + return filtered_paragraphs + + + +async def custom_filter_demo(url: str): + async with AsyncWebCrawler() as crawler: + custom_filter = MyCustomFilter() + result = await crawler.arun(url, extraction_strategy=custom_filter) + if result.success: + print(result.extracted_content) + +``` + +This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques. + +## Conclusion + +Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline. 
diff --git a/docs/md_v2/advanced/session-management-advanced.md b/docs/md_v2/advanced/session-management-advanced.md index f8c81da2..908828f7 100644 --- a/docs/md_v2/advanced/session-management-advanced.md +++ b/docs/md_v2/advanced/session-management-advanced.md @@ -30,7 +30,7 @@ Let's start with a basic example of session-based crawling: ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def basic_session_crawl(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -43,7 +43,7 @@ async def basic_session_crawl(): session_id=session_id, js_code="document.querySelector('.load-more-button').click();" if page > 0 else None, css_selector=".content-item", - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items") @@ -102,7 +102,7 @@ async def advanced_session_crawl_with_hooks(): session_id=session_id, css_selector="li.commit-item", js_code=js_next_page if page > 0 else None, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, js_only=page > 0 ) @@ -174,7 +174,7 @@ async def integrated_js_and_wait_crawl(): extraction_strategy=extraction_strategy, js_code=js_next_page_and_wait if page > 0 else None, js_only=page > 0, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) commits = json.loads(result.extracted_content) @@ -241,7 +241,7 @@ async def wait_for_parameter_crawl(): js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) commits = json.loads(result.extracted_content) diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index c38ed852..eae4cf7b 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -75,7 +75,7 @@ async def crawl_dynamic_content(): js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else 
None, js_only=page > 0, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) if result.success: diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md index 9ef73aef..509991e5 100644 --- a/docs/md_v2/api/arun.md +++ b/docs/md_v2/api/arun.md @@ -8,11 +8,26 @@ The following parameters can be passed to the `arun()` method. They are organize await crawler.arun( url="https://example.com", # Required: URL to crawl verbose=True, # Enable detailed logging - bypass_cache=False, # Skip cache for this request + cache_mode=CacheMode.ENABLED, # Control cache behavior warmup=True # Whether to run warmup check ) ``` +## Cache Control + +```python +from crawl4ai import CacheMode + +await crawler.arun( + cache_mode=CacheMode.ENABLED, # Normal caching (read/write) + # Other cache modes: + # cache_mode=CacheMode.DISABLED # No caching at all + # cache_mode=CacheMode.READ_ONLY # Only read from cache + # cache_mode=CacheMode.WRITE_ONLY # Only write to cache + # cache_mode=CacheMode.BYPASS # Skip cache for this operation +) +``` + ## Content Processing Parameters ### Text Processing @@ -162,14 +177,13 @@ await crawler.arun( ## Parameter Interactions and Notes -1. **Magic Mode Combinations** +1. 
**Cache and Performance Setup** ```python - # Full anti-detection setup + # Optimal caching for repeated crawls await crawler.arun( - magic=True, - headless=False, - simulate_user=True, - override_navigator=True + cache_mode=CacheMode.ENABLED, + word_count_threshold=10, + process_iframes=False ) ``` @@ -179,7 +193,8 @@ await crawler.arun( await crawler.arun( js_code="window.scrollTo(0, document.body.scrollHeight);", wait_for="css:.lazy-content", - delay_before_return_html=2.0 + delay_before_return_html=2.0, + cache_mode=CacheMode.WRITE_ONLY # Cache results after dynamic load ) ``` @@ -192,7 +207,8 @@ await crawler.arun( extraction_strategy=my_strategy, chunking_strategy=my_chunking, process_iframes=True, - remove_overlay_elements=True + remove_overlay_elements=True, + cache_mode=CacheMode.ENABLED ) ``` @@ -201,7 +217,7 @@ await crawler.arun( 1. **Performance Optimization** ```python await crawler.arun( - bypass_cache=False, # Use cache when possible + cache_mode=CacheMode.ENABLED, # Use full caching word_count_threshold=10, # Filter out noise process_iframes=False # Skip iframes if not needed ) @@ -212,7 +228,8 @@ await crawler.arun( await crawler.arun( magic=True, # Enable anti-detection delay_before_return_html=1.0, # Wait for dynamic content - page_timeout=60000 # Longer timeout for slow pages + page_timeout=60000, # Longer timeout for slow pages + cache_mode=CacheMode.WRITE_ONLY # Cache results after successful crawl ) ``` @@ -221,6 +238,7 @@ await crawler.arun( await crawler.arun( remove_overlay_elements=True, # Remove popups excluded_tags=['nav', 'aside'],# Remove unnecessary elements - keep_data_attributes=False # Remove data attributes + keep_data_attributes=False, # Remove data attributes + cache_mode=CacheMode.ENABLED # Use cache for faster processing ) ``` \ No newline at end of file diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md index 06998af3..7e3bda98 100644 --- a/docs/md_v2/api/crawl-result.md +++ 
b/docs/md_v2/api/crawl-result.md @@ -20,6 +20,7 @@ class CrawlResult(BaseModel): fit_html: Optional[str] = None # Most relevant HTML content markdown: Optional[str] = None # HTML converted to markdown fit_markdown: Optional[str] = None # Most relevant markdown content + downloaded_files: Optional[List[str]] = None # Downloaded files # Extracted Data extracted_content: Optional[str] = None # Content from extraction strategy diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 6c7960d2..c1c4d2ea 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -32,4 +32,5 @@ | async_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request | | async_webcrawler.py | session_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse | | async_webcrawler.py | only_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content | -| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl | \ No newline at end of file +| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl | +| async_webcrawler.py | cache_mode | `kwargs.get("cache_mode", CacheMode.ENABLED)` | AsyncWebCrawler | Cache handling mode for request | \ No newline at end of file diff --git a/docs/md_v2/basic/content_filtering.md b/docs/md_v2/basic/content_filtering.md new file mode 100644 index 00000000..9506c075 --- /dev/null +++ b/docs/md_v2/basic/content_filtering.md @@ -0,0 +1,84 @@ +# Content Filtering in Crawl4AI + +This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies.
+ +## Relevance Content Filter + +The `RelevantContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. + +## BM25 Algorithm + +The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query. + +### Usage + +To use the `BM25ContentFilter`, initialize it and then pass it as the `content_filter` parameter to the `arun` method of the crawler. + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query=None): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, content_filter=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering + if result.success: + print(f"Filtered Content (JSON):\n{result.extracted_content}") + print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object + print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing. + else: + print("Error:", result.error_message) + +# Example usage: +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query. + +``` + +### Parameters + +- **`user_query`**: (Optional) A string representing the search query.
If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query. +- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering. + + +## Fit Markdown Flag + +Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`. + + +## Custom Content Filtering Strategies + +You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs. + +```python +from crawl4ai.content_filter_strategy import RelevantContentFilter +from bs4 import BeautifulSoup, Tag +from typing import List + +class MyCustomFilter(RelevantContentFilter): + def filter_content(self, html: str) -> List[str]: + soup = BeautifulSoup(html, 'lxml') + # Implement custom filtering logic here + # Example: extract all paragraphs within divs with class "article-body" + filtered_paragraphs = [] + for tag in soup.select("div.article-body p"): + if isinstance(tag, Tag): + filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element. 
+ return filtered_paragraphs + + + +async def custom_filter_demo(url: str): + async with AsyncWebCrawler() as crawler: + custom_filter = MyCustomFilter() + result = await crawler.arun(url, content_filter=custom_filter) + if result.success: + print(result.extracted_content) + +``` + +This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques. + +## Conclusion + +Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline. diff --git a/docs/md_v2/basic/file-download.md b/docs/md_v2/basic/file-download.md new file mode 100644 index 00000000..c37e8812 --- /dev/null +++ b/docs/md_v2/basic/file-download.md @@ -0,0 +1,148 @@ +# Download Handling in Crawl4AI + +This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files. + +## Enabling Downloads + +By default, Crawl4AI does not download files. To enable downloads, set the `accept_downloads` parameter to `True` in either the `AsyncWebCrawler` constructor or the `arun` method. + +```python +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler(accept_downloads=True) as crawler: # Globally enable downloads + # ... your crawling logic ... + +asyncio.run(main()) +``` + +Or, enable it for a specific crawl: + +```python +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="...", accept_downloads=True) + # ... +``` + +## Specifying Download Location + +You can specify the download directory using the `downloads_path` parameter. 
If not provided, Crawl4AI creates a "downloads" directory inside the `.crawl4ai` folder in your home directory. + +```python +import os +from pathlib import Path + +# ... inside your crawl function: + +downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path +os.makedirs(downloads_path, exist_ok=True) + +result = await crawler.arun(url="...", downloads_path=downloads_path, accept_downloads=True) + +# ... +``` + +If you are setting it globally, provide the path to the AsyncWebCrawler: +```python +async def crawl_with_downloads(url: str, download_path: str): + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=download_path, # or set it on arun + verbose=True + ) as crawler: + result = await crawler.arun(url=url) # you still need to enable downloads per call. + # ... +``` + + + +## Triggering Downloads + +Downloads are typically triggered by user interactions on a web page (e.g., clicking a download button). You can simulate these actions with the `js_code` parameter, injecting JavaScript code to be executed within the browser context. The `wait_for` parameter might also be crucial to allowing sufficient time for downloads to initiate before the crawler proceeds. + +```python +result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Find and click the first Windows installer link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { + downloadLink.click(); + } + """, + wait_for=5 # Wait for 5 seconds for the download to start +) +``` + +## Accessing Downloaded Files + +Downloaded file paths are stored in the `downloaded_files` attribute of the returned `CrawlResult` object. This is a list of strings, with each string representing the absolute path to a downloaded file. 
+ +```python +if result.downloaded_files: + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path}") + # Perform operations with downloaded files, e.g., check file size + file_size = os.path.getsize(file_path) + print(f"- File size: {file_size} bytes") +else: + print("No files downloaded.") +``` + + +## Example: Downloading Multiple Files + +```python +import asyncio +import os +from pathlib import Path +from crawl4ai import AsyncWebCrawler + +async def download_multiple_files(url: str, download_path: str): + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=download_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url=url, + js_code=""" + // Trigger multiple downloads (example) + const downloadLinks = document.querySelectorAll('a[download]'); // Or a more specific selector + for (const link of downloadLinks) { + link.click(); + await new Promise(r => setTimeout(r, 2000)); // Add a small delay between clicks if needed + } + """, + wait_for=10 # Adjust the timeout to match the expected time for all downloads to start + ) + + if result.downloaded_files: + print("Downloaded files:") + for file in result.downloaded_files: + print(f"- {file}") + else: + print("No files downloaded.") + + +# Example usage +download_path = os.path.join(Path.home(), ".crawl4ai", "downloads") +os.makedirs(download_path, exist_ok=True) # Create directory if it doesn't exist + + +asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path)) +``` + +## Important Considerations + +- **Browser Context:** Downloads are managed within the browser context. Ensure your `js_code` correctly targets the download triggers on the specific web page. +- **Waiting:** Use `wait_for` to manage the timing of the crawl process if immediate download might not occur. +- **Error Handling:** Implement proper error handling to gracefully manage failed downloads or incorrect file paths. 
+- **Security:** Downloaded files should be scanned for potential security threats before use. + + + +This guide provides a foundation for handling downloads with Crawl4AI. You can adapt these techniques to manage downloads in various scenarios and integrate them into more complex crawling workflows. diff --git a/docs/md_v2/basic/quickstart.md b/docs/md_v2/basic/quickstart.md index f4904915..95b8a397 100644 --- a/docs/md_v2/basic/quickstart.md +++ b/docs/md_v2/basic/quickstart.md @@ -8,7 +8,7 @@ First, let's import the necessary modules and create an instance of `AsyncWebCra ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -42,7 +42,7 @@ async def capture_and_save_screenshot(url: str, output_path: str): result = await crawler.arun( url=url, screenshot=True, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) if result.success and result.screenshot: @@ -62,15 +62,15 @@ Crawl4AI supports multiple browser engines.
Here's how to use different browsers ```python # Use Firefox async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) # Use WebKit async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) # Use Chromium (default) async with AsyncWebCrawler(verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) ``` ### User Simulation 🎭 @@ -81,7 +81,7 @@ Simulate real user behavior to avoid detection: async with AsyncWebCrawler(verbose=True, headless=True) as crawler: result = await crawler.arun( url="YOUR-URL-HERE", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, simulate_user=True, # Causes random mouse movements and clicks override_navigator=True # Makes the browser appear more like a real user ) @@ -99,7 +99,7 @@ async def main(): print(f"First crawl result: {result1.markdown[:100]}...") # Force to crawl again - result2 = await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True) + result2 = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS) print(f"Second crawl result: {result2.markdown[:100]}...") asyncio.run(main()) @@ -189,7 +189,7 @@ extraction_strategy = LLMExtractionStrategy( async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://paulgraham.com/love.html", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, extraction_strategy=extraction_strategy ) ``` @@ -239,7 +239,7 @@ async def crawl_dynamic_content(): js_code=js_next_page if 
page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, headless=False, ) @@ -254,7 +254,7 @@ Remove overlay elements and fit content appropriately: async with AsyncWebCrawler(headless=False) as crawler: result = await crawler.arun( url="your-url-here", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, word_count_threshold=10, remove_overlay_elements=True, screenshot=True @@ -282,7 +282,7 @@ async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", word_count_threshold=0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, verbose=False, ) end = time.time() diff --git a/docs/md_v2/basic/simple-crawling.md b/docs/md_v2/basic/simple-crawling.md index 097d5e61..871fa64c 100644 --- a/docs/md_v2/basic/simple-crawling.md +++ b/docs/md_v2/basic/simple-crawling.md @@ -12,7 +12,9 @@ from crawl4ai import AsyncWebCrawler async def main(): async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="https://example.com") + result = await crawler.arun( + url="https://example.com" + ) print(result.markdown) # Print clean markdown content if __name__ == "__main__": @@ -24,7 +26,7 @@ if __name__ == "__main__": The `arun()` method returns a `CrawlResult` object with several useful properties. 
Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details): ```python -result = await crawler.arun(url="https://example.com") +result = await crawler.arun(url="https://example.com", fit_markdown=True) # Different content formats print(result.html) # Raw HTML @@ -81,7 +83,7 @@ Here's a more comprehensive example showing common usage patterns: ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -97,7 +99,7 @@ async def main(): remove_overlay_elements=True, # Cache control - bypass_cache=False # Use cache if available + cache_mode=CacheMode.ENABLED # Use cache if available ) if result.success: diff --git a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md index 3682425f..a9f00e92 100644 --- a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md +++ b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md @@ -52,7 +52,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove extraction_type="schema", instruction="Extract model names and fees for input and output tokens from the page."
), - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -98,7 +98,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove result = await crawler.arun( url="https://example.com/some-article", extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` diff --git a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md index 9f1c00ea..6100ae4c 100644 --- a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md +++ b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md @@ -55,7 +55,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -103,7 +103,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` diff --git a/docs/md_v2/tutorial/tutorial.md b/docs/md_v2/tutorial/tutorial.md index bf355ed0..7bead842 100644 --- a/docs/md_v2/tutorial/tutorial.md +++ b/docs/md_v2/tutorial/tutorial.md @@ -26,7 +26,7 @@ Here's a condensed outline of the **Installation and Setup** video content: - Walk through a simple test script to confirm the setup: ```python import asyncio - from crawl4ai import AsyncWebCrawler + from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -1093,7 +1093,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove extraction_type="schema", instruction="Extract model names and fees for input and output tokens from the page." 
), - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -1139,7 +1139,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove result = await crawler.arun( url="https://example.com/some-article", extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -1248,7 +1248,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -1296,7 +1296,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` From 152ac35bc2805610863d1f13efe8434fe2d290bd Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Nov 2024 21:09:26 +0800 Subject: [PATCH 31/50] feat(docs): update README for version 0.3.74 with new features and improvements fix(version): update version number to 0.3.74 refactor(async_webcrawler): enhance logging and add domain-based request delay --- README.md | 16 +++++------ crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 4 +-- crawl4ai/async_webcrawler.py | 43 +++++++++++++++++++++++++----- crawl4ai/config.py | 1 + 5 files changed, 47 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 9c3796cd..f6c8dc08 100644 --- a/README.md +++ b/README.md @@ -13,17 +13,15 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.74 ✨ -- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)! 
-- 📥 **Download Mastery:** Control downloads, specify folders, and track files within the `CrawlResult` object. -- 🔎 **Relevance Filtering:** Extract the most important content with the new `RelevanceContentFilter` and BM25 algorithm. Control filtering with the `fit_markdown` flag. +- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)! +- 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object. +- 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content. - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. -- 🤖 **Browser Boss:** Manage browser sessions with persistent contexts, process monitoring, and tf-playwright-stealth integration. Configure using `use_managed_browser`, `user_data_dir`, and `use_persistent_context` parameters. +- 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures. - ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter. -- 🔒 **API Security:** Protect your API server with token-based authentication using the `CRAWL4AI_API_TOKEN` environment variable. -- 🔄 **Synchronous & Direct Crawling:** Get immediate results with `/crawl_sync` or bypass the task queue with `/crawl_direct`. -- 🛠️ **Database Migration:** A new `crawl4ai-migrate` command ensures smooth upgrades and data integrity between versions. -- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing. 
- +- 🐳 **API Gateway:** Run Crawl4AI as a local or cloud API service, enabling cross-platform usage through a containerized server with secure token authentication via `CRAWL4AI_API_TOKEN`. +- 🛠️ **Database Improvements:** Enhanced database system for handling larger content sets with improved caching and faster performance. +- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing. ## Try it Now! diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 7ab71c9b..65ee6e73 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.731" \ No newline at end of file +__version__ = "0.3.74" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a67591af..90d5cbe8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -605,7 +605,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): proxy={"server": self.proxy} if self.proxy else None, java_script_enabled=True, accept_downloads=self.accept_downloads, - downloads_path=self.downloads_path if self.accept_downloads else None + # downloads_path=self.downloads_path if self.accept_downloads else None ) await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await context.set_extra_http_headers(self.headers) @@ -905,7 +905,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) return response except Error as e: - raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") + raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") # finally: # if not session_id: # await page.close() diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index d22e3b1f..79a17ac4 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -15,15 +15,19 @@ from .extraction_strategy import * from 
.async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy + from .config import ( MIN_WORD_THRESHOLD, - IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + URL_LOG_SHORTEN_LENGTH ) from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, format_html ) +from urllib.parse import urlparse +import random from .__version__ import __version__ as crawl4ai_version @@ -51,6 +55,7 @@ class AsyncWebCrawler: To disable deprecation warnings: Pass warning=False to suppress the warning. """ + _domain_last_hit = {} def __init__( self, @@ -248,7 +253,7 @@ class AsyncWebCrawler: screenshot_data = async_response.screenshot t2 = time.perf_counter() if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") + print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") # Process the HTML content crawl_result = await self.aprocess_html( @@ -283,7 +288,7 @@ class AsyncWebCrawler: crawl_result.session_id = kwargs.get("session_id", None) if verbose: - print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url} | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") + print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") # Update cache if appropriate @@ -295,7 +300,7 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url} | {e.msg}{Style.RESET_ALL}") + print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") return CrawlResult( url=url, html="", @@ -350,10 +355,29 @@ class AsyncWebCrawler: if cache_mode is None: cache_mode = CacheMode.BYPASS - semaphore_count = kwargs.get('semaphore_count', 5) + semaphore_count = kwargs.get('semaphore_count', 10) semaphore = asyncio.Semaphore(semaphore_count) async def crawl_with_semaphore(url): + domain = urlparse(url).netloc + current_time = time.time() + + print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + + # Get delay settings from kwargs or use defaults + mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay + max_range = kwargs.get('max_range', 0.3) # 1 seconds default max additional delay + + # Check if we need to wait + if domain in self._domain_last_hit: + time_since_last = current_time - self._domain_last_hit[domain] + if time_since_last < mean_delay: + delay = mean_delay + random.uniform(0, max_range) + await asyncio.sleep(delay) + + # Update last hit time + self._domain_last_hit[domain] = current_time + async with semaphore: return await self.arun( url, @@ -369,8 +393,13 @@ class AsyncWebCrawler: **kwargs, ) + # Print start message + print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + start_time = time.perf_counter() tasks = 
[crawl_with_semaphore(url) for url in urls] results = await asyncio.gather(*tasks, return_exceptions=True) + end_time = time.perf_counter() + print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") return [result if not isinstance(result, Exception) else str(result) for result in results] @@ -423,7 +452,7 @@ class AsyncWebCrawler: metadata = result.get("metadata", {}) if verbose: - print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url}{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") + print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") @@ -439,7 +468,7 @@ class AsyncWebCrawler: extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) if verbose: - print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url}{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 6b1324dd..786ca4e5 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -55,4 +55,5 @@ IMAGE_SCORE_THRESHOLD = 2 MAX_METRICS_HISTORY = 1000 NEED_MIGRATION = True +URL_LOG_SHORTEN_LENGTH = 30 SHOW_DEPRECATION_WARNINGS = True \ No newline at end of file From 852729ff380f0568d6874bc960606ba3cce0e935 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 18 Nov 2024 21:00:06 +0800 Subject: [PATCH 32/50] feat(docker): add Docker Compose configurations for local and 
hub deployment; enhance GPU support checks in Dockerfile feat(requirements): update requirements.txt to include snowballstemmer fix(version_manager): correct version parsing to use __version__.__version__ feat(main): introduce chunking strategy and content filter in CrawlRequest model feat(content_filter): enhance BM25 algorithm with priority tag scoring for improved content relevance feat(logger): implement new async logger engine replacing print statements throughout library fix(database): resolve version-related deadlock and circular lock issues in database operations docs(docker): expand Docker deployment documentation with usage instructions for Docker Compose --- Dockerfile | 12 +- crawl4ai/async_crawler_strategy.py | 149 ++++++++++++---- crawl4ai/async_database.py | 189 +++++++++++++------- crawl4ai/async_logger.py | 231 +++++++++++++++++++++++++ crawl4ai/async_webcrawler.py | 144 +++++++++++---- crawl4ai/content_filter_strategy.py | 71 ++++---- crawl4ai/content_scrapping_strategy.py | 44 ++++- crawl4ai/version_manager.py | 4 +- docker-compose.hub.yml | 27 +++ docker-compose.local.yml | 33 ++++ docker-compose.yml | 47 ++++- docs/examples/v0.3.74.overview.py | 119 +++++++++---- docs/md_v2/basic/docker-deploymeny.md | 88 ++++++++++ main.py | 23 ++- requirements.txt | 3 +- 15 files changed, 952 insertions(+), 232 deletions(-) create mode 100644 crawl4ai/async_logger.py create mode 100644 docker-compose.hub.yml create mode 100644 docker-compose.local.yml diff --git a/Dockerfile b/Dockerfile index aac2280a..bd71deae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,11 +62,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libatspi2.0-0 \ && rm -rf /var/lib/apt/lists/* -# GPU support if enabled -RUN if [ "$ENABLE_GPU" = "true" ] ; then \ - apt-get update && apt-get install -y --no-install-recommends \ - nvidia-cuda-toolkit \ - && rm -rf /var/lib/apt/lists/* ; \ +# GPU support if enabled and architecture is supported +RUN if [ "$ENABLE_GPU" = 
"true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ + else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \ fi # Create and set working directory diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 90d5cbe8..a6ba8e50 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -35,13 +35,15 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless self.browser_process = None self.temp_dir = None self.debugging_port = 9222 + self.logger = logger + self.shutting_down = False async def start(self) -> str: """ @@ -76,15 +78,38 @@ class ManagedBrowser: async def _monitor_browser_process(self): """Monitor the browser process for unexpected termination.""" if self.browser_process: - stdout, stderr = await asyncio.gather( - asyncio.to_thread(self.browser_process.stdout.read), - asyncio.to_thread(self.browser_process.stderr.read) - ) - if self.browser_process.poll() is not None: - print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}") - print(f"STDOUT: {stdout.decode()}") - print(f"STDERR: {stderr.decode()}") - await self.cleanup() + try: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read) + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + 
message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode() + } + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode} + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)} + ) def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" @@ -134,20 +159,39 @@ class ManagedBrowser: async def cleanup(self): """Cleanup browser process and temporary directory""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + if self.browser_process: try: self.browser_process.terminate() - await asyncio.sleep(1) + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running if self.browser_process.poll() is None: self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + except Exception as e: - print(f"Error terminating browser: {e}") + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)} + ) if self.temp_dir and os.path.exists(self.temp_dir): try: shutil.rmtree(self.temp_dir) except Exception as e: - print(f"Error removing temporary directory: {e}") + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)} + ) class AsyncCrawlerStrategy(ABC): @@ -172,7 +216,8 @@ class AsyncCrawlerStrategy(ABC): pass class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, 
use_cached_html=False, js_code=None, **kwargs): + def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): + self.logger = logger self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", @@ -231,7 +276,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.managed_browser = ManagedBrowser( browser_type=self.browser_type, user_data_dir=self.user_data_dir, - headless=self.headless + headless=self.headless, + logger=self.logger ) cdp_url = await self.managed_browser.start() self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) @@ -282,6 +328,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Add extra args if provided if self.extra_args: browser_args["args"].extend(self.extra_args) + + # Add downloads path if downloads are enabled + if self.accept_downloads: + browser_args["downloads_path"] = self.downloads_path # Add proxy settings if a proxy is specified if self.proxy: @@ -344,6 +394,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.browser = None if self.managed_browser: + await asyncio.sleep(0.5) await self.managed_browser.cleanup() self.managed_browser = None @@ -491,9 +542,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }} """) else: - print(f"Warning: Could not access content frame for iframe {i}") + # print(f"Warning: Could not access content frame for iframe {i}") + self.logger.warning( + message="Could not access content frame for iframe {index}", + tag="SCRAPE", + params={"index": i} + ) except Exception as e: - print(f"Error processing iframe {i}: {str(e)}") + self.logger.error( + message="Error processing iframe {index}: {error}", + tag="ERROR", + params={"index": i, "error": str(e)} + ) + # print(f"Error processing iframe {i}: {str(e)}") # Return the page object return page @@ -620,7 +681,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): context = await self.browser.new_context( 
user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=self.accept_downloads, ) await context.set_extra_http_headers(self.headers) @@ -917,17 +979,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): suggested_filename = download.suggested_filename download_path = os.path.join(self.downloads_path, suggested_filename) - if self.verbose: - print(f"[LOG] 📥 Downloading {suggested_filename} to {download_path}") + self.logger.info( + message="Downloading {filename} to {path}", + tag="FETCH", + params={"filename": suggested_filename, "path": download_path} + ) + start_time = time.perf_counter() await download.save_as(download_path) + end_time = time.perf_counter() self._downloaded_files.append(download_path) - - if self.verbose: - print(f"[LOG] ✅ Downloaded {suggested_filename} successfully") + + self.logger.success( + message="Downloaded {filename} successfully", + tag="COMPLETE", + params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} + ) except Exception as e: - if self.verbose: - print(f"[ERROR] Failed to handle download: {str(e)}") + self.logger.error( + message="Failed to handle download: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + # if self.verbose: + # print(f"[ERROR] Failed to handle download: {str(e)}") async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed @@ -1070,8 +1146,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await page.evaluate(remove_overlays_js) await page.wait_for_timeout(500) # Wait for any animations to complete except Exception as e: - if self.verbose: - print(f"Warning: Failed to remove overlay elements: {str(e)}") + self.logger.warning( + message="Failed to remove overlay elements: {error}", + 
tag="SCRAPE", + params={"error": str(e)} + ) + # if self.verbose: + # print(f"Warning: Failed to remove overlay elements: {str(e)}") async def take_screenshot(self, page: Page) -> str: """ @@ -1089,7 +1170,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + # Generate an error image img = Image.new('RGB', (800, 600), color='black') @@ -1123,7 +1209,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) + # print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) # Generate an error image img = Image.new('RGB', (800, 600), color='black') diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 7809dfe1..19160b6e 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -12,10 +12,12 @@ import xxhash import aiofiles from .config import NEED_MIGRATION from .version_manager import VersionManager +from .async_logger import AsyncLogger # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +base_directory = Path.home() DB_PATH = os.path.join(Path.home(), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") @@ -28,15 +30,21 @@ class AsyncDatabaseManager: self.max_retries = max_retries self.connection_pool: Dict[int, aiosqlite.Connection] = {} self.pool_lock = asyncio.Lock() + self.init_lock = asyncio.Lock() self.connection_semaphore = asyncio.Semaphore(pool_size) self._initialized = False self.version_manager = VersionManager() + self.logger 
= AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"), + verbose=False, + tag_width=10 + ) async def initialize(self): """Initialize the database and connection pool""" try: - logger.info("Initializing database...") + self.logger.info("Initializing database", tag="INIT") # Ensure the database file exists os.makedirs(os.path.dirname(self.db_path), exist_ok=True) @@ -47,31 +55,39 @@ class AsyncDatabaseManager: await self.ainit_db() # Verify the table exists - async def verify_table(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: async with db.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'" ) as cursor: result = await cursor.fetchone() if not result: raise Exception("crawled_data table was not created") - - await self.execute_with_retry(verify_table) # If version changed or fresh install, run updates if needs_update: - logger.info("New version detected, running updates...") + self.logger.info("New version detected, running updates", tag="INIT") await self.update_db_schema() from .migrations import run_migration # Import here to avoid circular imports await run_migration() self.version_manager.update_version() # Update stored version after successful migration - logger.info("Version update completed successfully") + self.logger.success("Version update completed successfully", tag="COMPLETE") else: - logger.info("Database initialization completed successfully") + self.logger.success("Database initialization completed successfully", tag="COMPLETE") + except Exception as e: - logger.error(f"Database initialization error: {e}") - logger.info("Database will be initialized on first use") + self.logger.error( + message="Database initialization error: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.logger.info( + message="Database will be initialized on first use", + tag="INIT" + ) + raise + async def cleanup(self): """Cleanup connections when shutting down""" @@ -84,34 
+100,41 @@ class AsyncDatabaseManager: async def get_connection(self): """Connection pool manager""" if not self._initialized: - async with self.pool_lock: # Prevent multiple simultaneous initializations - if not self._initialized: # Double-check after acquiring lock + # Use an asyncio.Lock to ensure only one initialization occurs + async with self.init_lock: + if not self._initialized: await self.initialize() self._initialized = True - async with self.connection_semaphore: - task_id = id(asyncio.current_task()) - try: - async with self.pool_lock: - if task_id not in self.connection_pool: - conn = await aiosqlite.connect( - self.db_path, - timeout=30.0 - ) - await conn.execute('PRAGMA journal_mode = WAL') - await conn.execute('PRAGMA busy_timeout = 5000') - self.connection_pool[task_id] = conn - - yield self.connection_pool[task_id] - - except Exception as e: - logger.error(f"Connection error: {e}") - raise - finally: - async with self.pool_lock: - if task_id in self.connection_pool: - await self.connection_pool[task_id].close() - del self.connection_pool[task_id] + await self.connection_semaphore.acquire() + task_id = id(asyncio.current_task()) + try: + async with self.pool_lock: + if task_id not in self.connection_pool: + conn = await aiosqlite.connect( + self.db_path, + timeout=30.0 + ) + await conn.execute('PRAGMA journal_mode = WAL') + await conn.execute('PRAGMA busy_timeout = 5000') + self.connection_pool[task_id] = conn + + yield self.connection_pool[task_id] + + except Exception as e: + self.logger.error( + message="Connection error: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + raise + finally: + async with self.pool_lock: + if task_id in self.connection_pool: + await self.connection_pool[task_id].close() + del self.connection_pool[task_id] + self.connection_semaphore.release() async def execute_with_retry(self, operation, *args): @@ -124,13 +147,21 @@ class AsyncDatabaseManager: return result except Exception as e: if 
attempt == self.max_retries - 1: - logger.error(f"Operation failed after {self.max_retries} attempts: {e}") + self.logger.error( + message="Operation failed after {retries} attempts: {error}", + tag="ERROR", + force_verbose=True, + params={ + "retries": self.max_retries, + "error": str(e) + } + ) raise await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff async def ainit_db(self): """Initialize database schema""" - async def _init(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: await db.execute(''' CREATE TABLE IF NOT EXISTS crawled_data ( url TEXT PRIMARY KEY, @@ -147,36 +178,37 @@ class AsyncDatabaseManager: downloaded_files TEXT DEFAULT "{}" -- New column added ) ''') - - await self.execute_with_retry(_init) + await db.commit() + async def update_db_schema(self): """Update database schema if needed""" - async def _check_columns(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: cursor = await db.execute("PRAGMA table_info(crawled_data)") columns = await cursor.fetchall() - return [column[1] for column in columns] + column_names = [column[1] for column in columns] + + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] + + for column in new_columns: + if column not in column_names: + await self.aalter_db_add_column(column, db) + await db.commit() - column_names = await self.execute_with_retry(_check_columns) - - # List of new columns to add - new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] - - for column in new_columns: - if column not in column_names: - await self.aalter_db_add_column(column) - - async def aalter_db_add_column(self, new_column: str): + async def aalter_db_add_column(self, new_column: str, db): """Add new column to the database""" - async def _alter(db): - if new_column == 'response_headers': - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT 
DEFAULT "{{}}"') - else: - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') - logger.info(f"Added column '{new_column}' to the database.") + if new_column == 'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + self.logger.info( + message="Added column '{column}' to the database", + tag="INIT", + params={"column": new_column} + ) - await self.execute_with_retry(_alter) async def aget_cached_url(self, url: str) -> Optional[CrawlResult]: """Retrieve cached URL data as CrawlResult""" @@ -235,7 +267,12 @@ class AsyncDatabaseManager: try: return await self.execute_with_retry(_get) except Exception as e: - logger.error(f"Error retrieving cached URL: {e}") + self.logger.error( + message="Error retrieving cached URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return None async def acache_url(self, result: CrawlResult): @@ -291,7 +328,13 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_cache) except Exception as e: - logger.error(f"Error caching URL: {e}") + self.logger.error( + message="Error caching URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + async def aget_total_count(self) -> int: """Get total number of cached URLs""" @@ -303,7 +346,12 @@ class AsyncDatabaseManager: try: return await self.execute_with_retry(_count) except Exception as e: - logger.error(f"Error getting total count: {e}") + self.logger.error( + message="Error getting total count: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return 0 async def aclear_db(self): @@ -314,7 +362,12 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_clear) except Exception as e: - logger.error(f"Error clearing database: {e}") + self.logger.error( + message="Error clearing database: {error}", + 
tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) async def aflush_db(self): """Drop the entire table""" @@ -324,7 +377,12 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_flush) except Exception as e: - logger.error(f"Error flushing database: {e}") + self.logger.error( + message="Error flushing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) async def _store_content(self, content: str, content_type: str) -> str: @@ -352,7 +410,12 @@ class AsyncDatabaseManager: async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: return await f.read() except: - logger.error(f"Failed to load content: {file_path}") + self.logger.error( + message="Failed to load content: {file_path}", + tag="ERROR", + force_verbose=True, + params={"file_path": file_path} + ) return None # Create a singleton instance diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py new file mode 100644 index 00000000..220edd11 --- /dev/null +++ b/crawl4ai/async_logger.py @@ -0,0 +1,231 @@ +from enum import Enum +from typing import Optional, Dict, Any, Union +from colorama import Fore, Back, Style, init +import time +import os +from datetime import datetime + +class LogLevel(Enum): + DEBUG = 1 + INFO = 2 + SUCCESS = 3 + WARNING = 4 + ERROR = 5 + +class AsyncLogger: + """ + Asynchronous logger with support for colored console output and file logging. + Supports templated messages with colored components. 
+ """ + + DEFAULT_ICONS = { + 'INIT': '→', + 'READY': '✓', + 'FETCH': '↓', + 'SCRAPE': '◆', + 'EXTRACT': '■', + 'COMPLETE': '●', + 'ERROR': '×', + 'DEBUG': '⋯', + 'INFO': 'ℹ', + 'WARNING': '⚠', + } + + DEFAULT_COLORS = { + LogLevel.DEBUG: Fore.LIGHTBLACK_EX, + LogLevel.INFO: Fore.CYAN, + LogLevel.SUCCESS: Fore.GREEN, + LogLevel.WARNING: Fore.YELLOW, + LogLevel.ERROR: Fore.RED, + } + + def __init__( + self, + log_file: Optional[str] = None, + log_level: LogLevel = LogLevel.INFO, + tag_width: int = 10, + icons: Optional[Dict[str, str]] = None, + colors: Optional[Dict[LogLevel, str]] = None, + verbose: bool = True + ): + """ + Initialize the logger. + + Args: + log_file: Optional file path for logging + log_level: Minimum log level to display + tag_width: Width for tag formatting + icons: Custom icons for different tags + colors: Custom colors for different log levels + verbose: Whether to output to console + """ + init() # Initialize colorama + self.log_file = log_file + self.log_level = log_level + self.tag_width = tag_width + self.icons = icons or self.DEFAULT_ICONS + self.colors = colors or self.DEFAULT_COLORS + self.verbose = verbose + + # Create log file directory if needed + if log_file: + os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) + + def _format_tag(self, tag: str) -> str: + """Format a tag with consistent width.""" + return f"[{tag}]".ljust(self.tag_width, ".") + + def _get_icon(self, tag: str) -> str: + """Get the icon for a tag, defaulting to info icon if not found.""" + return self.icons.get(tag, self.icons['INFO']) + + def _write_to_file(self, message: str): + """Write a message to the log file if configured.""" + if self.log_file: + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + with open(self.log_file, 'a', encoding='utf-8') as f: + # Strip ANSI color codes for file output + clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '') + for color in vars(Fore).values(): + if 
isinstance(color, str): + clean_message = clean_message.replace(color, '') + f.write(f"[{timestamp}] {clean_message}\n") + + def _log( + self, + level: LogLevel, + message: str, + tag: str, + params: Optional[Dict[str, Any]] = None, + colors: Optional[Dict[str, str]] = None, + base_color: Optional[str] = None, + **kwargs + ): + """ + Core logging method that handles message formatting and output. + + Args: + level: Log level for this message + message: Message template string + tag: Tag for the message + params: Parameters to format into the message + colors: Color overrides for specific parameters + base_color: Base color for the entire message + """ + if level.value < self.log_level.value: + return + + # Format the message with parameters if provided + if params: + try: + # First format the message with raw parameters + formatted_message = message.format(**params) + + # Then apply colors if specified + if colors: + for key, color in colors.items(): + # Find the formatted value in the message and wrap it with color + if key in params: + value_str = str(params[key]) + formatted_message = formatted_message.replace( + value_str, + f"{color}{value_str}{Style.RESET_ALL}" + ) + + except KeyError as e: + formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template" + level = LogLevel.ERROR + else: + formatted_message = message + + # Construct the full log line + color = base_color or self.colors[level] + log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}" + + # Output to console if verbose + if self.verbose or kwargs.get("force_verbose", False): + print(log_line) + + # Write to file if configured + self._write_to_file(log_line) + + def debug(self, message: str, tag: str = "DEBUG", **kwargs): + """Log a debug message.""" + self._log(LogLevel.DEBUG, message, tag, **kwargs) + + def info(self, message: str, tag: str = "INFO", **kwargs): + """Log an info message.""" + self._log(LogLevel.INFO, message, tag, 
**kwargs) + + def success(self, message: str, tag: str = "SUCCESS", **kwargs): + """Log a success message.""" + self._log(LogLevel.SUCCESS, message, tag, **kwargs) + + def warning(self, message: str, tag: str = "WARNING", **kwargs): + """Log a warning message.""" + self._log(LogLevel.WARNING, message, tag, **kwargs) + + def error(self, message: str, tag: str = "ERROR", **kwargs): + """Log an error message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + + def url_status( + self, + url: str, + success: bool, + timing: float, + tag: str = "FETCH", + url_length: int = 50 + ): + """ + Convenience method for logging URL fetch status. + + Args: + url: The URL being processed + success: Whether the operation was successful + timing: Time taken for the operation + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.SUCCESS if success else LogLevel.ERROR, + message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "status": success, + "timing": timing + }, + colors={ + "status": Fore.GREEN if success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + def error_status( + self, + url: str, + error: str, + tag: str = "ERROR", + url_length: int = 50 + ): + """ + Convenience method for logging error status. + + Args: + url: The URL being processed + error: Error message + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.ERROR, + message="{url:.{url_length}}... 
| Error: {error}", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "error": error + } + ) \ No newline at end of file diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 79a17ac4..5fe7822c 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -15,6 +15,7 @@ from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy +from .async_logger import AsyncLogger from .config import ( MIN_WORD_THRESHOLD, @@ -74,19 +75,29 @@ class AsyncWebCrawler: always_by_pass_cache: Deprecated, use always_bypass_cache instead base_directory: Base directory for storing cache """ - init() - self.log_width = 10 # Width of "[COMPLETE]" - self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") - self.log_icons = { - 'INIT': '→', # Alternative: '▶' or '►' - 'READY': '✓', # Alternative: '√' - 'FETCH': '↓', # Alternative: '▼' - 'SCRAPE': '◆', # Alternative: '♦' - 'EXTRACT': '■', # Alternative: '□' - 'COMPLETE': '●', # Alternative: '○' - 'ERROR': '×' - } - self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs) + # init() + # self.log_width = 10 # Width of "[COMPLETE]" + # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") + # self.log_icons = { + # 'INIT': '→', # Alternative: '▶' or '►' + # 'READY': '✓', # Alternative: '√' + # 'FETCH': '↓', # Alternative: '▼' + # 'SCRAPE': '◆', # Alternative: '♦' + # 'EXTRACT': '■', # Alternative: '□' + # 'COMPLETE': '●', # Alternative: '○' + # 'ERROR': '×' + # } + self.verbose = kwargs.get("verbose", False) + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), + verbose=self.verbose, + tag_width=10 + ) + + self.crawler_strategy = crawler_strategy or 
AsyncPlaywrightCrawlerStrategy( + logger = self.logger, + **kwargs + ) # Handle deprecated parameter if always_by_pass_cache is not None: @@ -118,12 +129,13 @@ class AsyncWebCrawler: async def awarmup(self): """Initialize the crawler with warm-up sequence.""" - if self.verbose: - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + # if self.verbose: + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") self.ready = True - if self.verbose: - print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") + # if self.verbose: + # print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") async def arun( self, @@ -234,8 +246,14 @@ class AsyncWebCrawler: screenshot_data = cached_result.screenshot if not screenshot_data: cached_result = None - if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH" + ) # Fetch fresh content if needed @@ 
-252,8 +270,14 @@ class AsyncWebCrawler: html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot t2 = time.perf_counter() - if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=t2 - t1, + tag="FETCH" + ) + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") # Process the HTML content crawl_result = await self.aprocess_html( @@ -287,9 +311,21 @@ class AsyncWebCrawler: crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) - if verbose: - print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") - + # if verbose: + # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="{url:.50}... 
| Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) # Update cache if appropriate if cache_context.should_write() and not bool(cached_result): @@ -300,7 +336,12 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + self.logger.error_status( + url=cache_context.display_url, + error=e.msg, + tag="ERROR" + ) return CrawlResult( url=url, html="", @@ -362,7 +403,12 @@ class AsyncWebCrawler: domain = urlparse(url).netloc current_time = time.time() - print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + self.logger.debug( + message="Started task for {url:.50}...", + tag="PARALLEL", + params={"url": url} + ) # Get delay settings from kwargs or use defaults mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay @@ -394,12 +440,26 @@ class AsyncWebCrawler: ) # Print start message - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + self.logger.info( + message="Starting concurrent crawling for {count} URLs...", + tag="INIT", + 
params={"count": len(urls)} + ) start_time = time.perf_counter() tasks = [crawl_with_semaphore(url) for url in urls] results = await asyncio.gather(*tasks, return_exceptions=True) end_time = time.perf_counter() - print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") + # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL, + tag="COMPLETE", + params={ + "count": len(urls), + "timing": f"{end_time - start_time:.2f}s" + }, + colors={"timing": Fore.YELLOW} + ) return [result if not isinstance(result, Exception) else str(result) for result in results] @@ -451,9 +511,16 @@ class AsyncWebCrawler: links = result.get("links", []) metadata = result.get("metadata", {}) - if verbose: - print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") - + # if verbose: + # print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") + self.logger.info( + message="Processed {url:.50}... 
| Time: {timing}ms", + tag="SCRAPE", + params={ + "url": _url, + "timing": int((time.perf_counter() - t1) * 1000) + } + ) if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): @@ -467,8 +534,17 @@ class AsyncWebCrawler: sections = chunking_strategy.chunk(markdown) extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - if verbose: - print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + # if verbose: + # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + self.logger.info( + message="Completed for {url:.50}... | Time: {timing}s", + tag="EXTRACT", + params={ + "url": _url, + "timing": time.perf_counter() - t1 + } + ) + diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 88375da9..88216f7f 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -8,6 +8,10 @@ from bs4 import BeautifulSoup, NavigableString, Tag from .utils import clean_tokens from abc import ABC, abstractmethod +from snowballstemmer import stemmer + +# from nltk.stem import PorterStemmer +# ps = PorterStemmer() class RelevantContentFilter(ABC): def __init__(self, user_query: str = None): self.user_query = user_query @@ -252,7 +256,7 @@ class RelevantContentFilter(ABC): return str(tag) # Fallback to original if anything fails class BM25ContentFilter(RelevantContentFilter): - def __init__(self, user_query: str = None, bm25_threshold: float = 1.0): + def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'): 
super().__init__(user_query=user_query) self.bm25_threshold = bm25_threshold self.priority_tags = { @@ -268,6 +272,7 @@ class BM25ContentFilter(RelevantContentFilter): 'pre': 1.5, 'th': 1.5, # Table headers } + self.stemmer = stemmer(language) def filter_content(self, html: str) -> List[str]: """Implements content filtering using BM25 algorithm with priority tag handling""" @@ -282,58 +287,42 @@ class BM25ContentFilter(RelevantContentFilter): if not candidates: return [] - # Split into priority and regular candidates - priority_candidates = [] - regular_candidates = [] + # Tokenize corpus + # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates] + # tokenized_query = query.lower().split() + + # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()] + # for _, chunk, _, _ in candidates] + # tokenized_query = [ps.stem(word) for word in query.lower().split()] - for index, chunk, tag_type, tag in candidates: - if tag.name in self.priority_tags: - priority_candidates.append((index, chunk, tag_type, tag)) - else: - regular_candidates.append((index, chunk, tag_type, tag)) + tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()] + for _, chunk, _, _ in candidates] + tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()] - # Process regular content with BM25 - tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates] - tokenized_query = query.lower().split() - # Clean from stop words and noise tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] tokenized_query = clean_tokens(tokenized_query) - + bm25 = BM25Okapi(tokenized_corpus) scores = bm25.get_scores(tokenized_query) - # Score and boost regular candidates - scored_candidates = [ - (score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag) - for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates) + # Adjust scores with tag weights + 
adjusted_candidates = [] + for score, (index, chunk, tag_type, tag) in zip(scores, candidates): + tag_weight = self.priority_tags.get(tag.name, 1.0) + adjusted_score = score * tag_weight + adjusted_candidates.append((adjusted_score, index, chunk, tag)) + + # Filter candidates by threshold + selected_candidates = [ + (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates + if adjusted_score >= self.bm25_threshold ] - scored_candidates.sort(key=lambda x: x[0], reverse=True) - - # Process scored candidates - selected_tags = set() - selected_candidates = [] - - # First add all priority candidates - for index, chunk, tag_type, tag in priority_candidates: - tag_id = id(tag) - if tag_id not in selected_tags: - selected_candidates.append((index, chunk, tag)) - selected_tags.add(tag_id) - - # Then add scored regular candidates that meet threshold - for score, index, chunk, tag_type, tag in scored_candidates: - if score < self.bm25_threshold: - continue - tag_id = id(tag) - if tag_id not in selected_tags: - selected_candidates.append((index, chunk, tag)) - selected_tags.add(tag_id) if not selected_candidates: return [] - # Sort by original document order + # Sort selected candidates by original document order selected_candidates.sort(key=lambda x: x[0]) - return [self.clean_element(tag) for _, _, tag in selected_candidates] + return [self.clean_element(tag) for _, _, tag in selected_candidates] diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index d16b0680..0f470671 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -149,6 +149,15 @@ class ContentScrapingStrategy(ABC): pass class WebScrapingStrategy(ContentScrapingStrategy): + def __init__(self, logger=None): + self.logger = logger + + def _log(self, level, message, tag="SCRAPE", **kwargs): + """Helper method to safely use logger.""" + if self.logger: + log_method = getattr(self.logger, level) + 
log_method(message=message, tag=tag, **kwargs) + def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs) @@ -167,7 +176,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: meta = extract_metadata("", soup) except Exception as e: - print('Error extracting metadata:', str(e)) + self._log('error', + message="Error extracting metadata: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # print('Error extracting metadata:', str(e)) meta = {} @@ -430,9 +444,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False)) except Exception as e: - print('Error removing unwanted attributes:', str(e)) - - + # print('Error removing unwanted attributes:', str(e)) + self._log('error', + message="Error removing unwanted attributes: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) # Process children for child in list(element.children): if isinstance(child, NavigableString) and not isinstance(child, Comment): @@ -453,7 +470,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): return keep_element except Exception as e: - print('Error processing element:', str(e)) + # print('Error processing element:', str(e)) + self._log('error', + message="Error processing element: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) return False process_element(body) @@ -516,7 +538,10 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = body.encode_contents().decode('utf-8') print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.") - + self._log('error', + message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. 
Check the markdown for further details.", + tag="SCRAPE" + ) cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') @@ -525,6 +550,13 @@ class WebScrapingStrategy(ContentScrapingStrategy): h.update_params(**kwargs.get('html2text', {})) markdown = h.handle(cleaned_html) except Exception as e: + if not h: + h = CustomHTML2Text() + self._log('error', + message="Error converting HTML to markdown: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) markdown = h.handle(sanitize_html(cleaned_html)) markdown = markdown.replace(' ```', '```') diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py index 07e0c0e9..8ae2de2e 100644 --- a/crawl4ai/version_manager.py +++ b/crawl4ai/version_manager.py @@ -20,11 +20,11 @@ class VersionManager: def update_version(self): """Update the version file to current library version""" - self.version_file.write_text(__version__) + self.version_file.write_text(__version__.__version__) def needs_update(self): """Check if database needs update based on version""" installed = self.get_installed_version() - current = version.parse(__version__) + current = version.parse(__version__.__version__) return installed is None or installed < current diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml new file mode 100644 index 00000000..9bcfa982 --- /dev/null +++ b/docker-compose.hub.yml @@ -0,0 +1,27 @@ +services: + crawl4ai: + image: unclecode/crawl4ai:basic # Pull image from Docker Hub + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + 
healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml new file mode 100644 index 00000000..7dc41b47 --- /dev/null +++ b/docker-compose.local.yml @@ -0,0 +1,33 @@ +services: + crawl4ai: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: 3.10 + INSTALL_TYPE: all + ENABLE_GPU: false + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index ef0dc9e4..1097ef11 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: crawl4ai: build: @@ -9,15 +7,18 @@ services: PYTHON_VERSION: 3.10 INSTALL_TYPE: all ENABLE_GPU: false + profiles: ["local"] ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations + - /dev/shm:/dev/shm deploy: resources: limits: 
@@ -30,4 +31,32 @@ services: interval: 30s timeout: 10s retries: 3 - start_period: 40s \ No newline at end of file + start_period: 40s + + crawl4ai-hub: + image: unclecode/crawl4ai:basic + profiles: ["hub"] + ports: + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py index ec3a7d73..00296740 100644 --- a/docs/examples/v0.3.74.overview.py +++ b/docs/examples/v0.3.74.overview.py @@ -1,9 +1,16 @@ +import os, sys +# append the parent directory to the sys.path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +parent_parent_dir = os.path.dirname(parent_dir) +sys.path.append(parent_parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) +__data__ = os.path.join(__location__, "__data") import asyncio -import os from pathlib import Path import aiohttp import json -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.content_filter_strategy import BM25ContentFilter # 1. 
File Download Processing Example @@ -32,7 +39,8 @@ async def download_example(): console.log('No .exe download link found'); } """, - wait_for=5 # Wait 5 seconds to ensure download starts + delay_before_return_html=1, # Wait 5 seconds to ensure download starts + cache_mode=CacheMode.BYPASS ) if result.downloaded_files: @@ -50,22 +58,32 @@ async def content_filtering_example(): async with AsyncWebCrawler(verbose=True) as crawler: # Create filter with custom query for OpenAI's blog content_filter = BM25ContentFilter( - user_query="AI language models research innovation", + # user_query="Investment and fundraising", + # user_query="Robotic", bm25_threshold=1.0 ) result = await crawler.arun( - url="https://openai.com/blog", - content_filter=content_filter + url="https://techcrunch.com/", + content_filter=content_filter, + cache_mode=CacheMode.BYPASS ) - print(f"Filtered content: {result.extracted_content}") + print(f"Filtered content: {len(result.fit_markdown)}") + print(f"Filtered content: {result.fit_markdown}") + + # Save html + with open(os.path.join(__data__, "techcrunch.html"), "w") as f: + f.write(result.fit_html) + + with open(os.path.join(__data__, "filtered_content.md"), "w") as f: + f.write(result.fit_markdown) # 3. 
Local File and Raw HTML Processing Example async def local_and_raw_html_example(): """Example of processing local files and raw HTML""" # Create a sample HTML file - sample_file = "sample.html" + sample_file = os.path.join(__data__, "sample.html") with open(sample_file, "w") as f: f.write(""" @@ -112,21 +130,18 @@ async def browser_management_example(): headless=False, verbose=True ) as crawler: + + result = await crawler.arun( + url="https://crawl4ai.com", + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS + ) # Use GitHub as an example - it's a good test for browser management # because it requires proper browser handling result = await crawler.arun( url="https://github.com/trending", - session_id="persistent_session_1", - js_code=""" - // Custom JavaScript to execute on GitHub's trending page - const repos = document.querySelectorAll('article.Box-row'); - const data = Array.from(repos).map(repo => ({ - name: repo.querySelector('h2')?.textContent?.trim(), - description: repo.querySelector('p')?.textContent?.trim(), - language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim() - })); - console.log('Trending repositories:', JSON.stringify(data, null, 2)); - """ + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS ) print("\nBrowser session result:", result.success) @@ -136,6 +151,8 @@ async def browser_management_example(): # 5. 
API Usage Example async def api_example(): """Example of using the new API endpoints""" + api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" + headers = {'Authorization': f'Bearer {api_token}'} async with aiohttp.ClientSession() as session: # Submit crawl job crawl_request = { @@ -143,52 +160,78 @@ async def api_example(): "extraction_config": { "type": "json_css", "params": { - "selectors": { - "titles": ".title a", - "scores": ".score", - "comments": ".comment-tree" + "schema": { + "name": "Hacker News Articles", + "baseSelector": ".athing", + "fields": [ + { + "name": "title", + "selector": ".title a", + "type": "text" + }, + { + "name": "score", + "selector": ".score", + "type": "text" + }, + { + "name": "url", + "selector": ".title a", + "type": "attribute", + "attribute": "href" + } + ] } } }, "crawler_params": { "headless": True, - "use_managed_browser": True + # "use_managed_browser": True }, - "screenshot": True, - "magic": True + "cache_mode": "bypass", + # "screenshot": True, + # "magic": True } async with session.post( "http://localhost:11235/crawl", - json=crawl_request + json=crawl_request, + headers=headers ) as response: task_data = await response.json() task_id = task_data["task_id"] # Check task status - async with session.get( - f"http://localhost:11235/task/{task_id}" - ) as status_response: - result = await status_response.json() - print(f"Task result: {result}") + while True: + async with session.get( + f"http://localhost:11235/task/{task_id}", + headers=headers + ) as status_response: + result = await status_response.json() + print(f"Task result: {result}") + + if result["status"] == "completed": + break + else: + await asyncio.sleep(1) # Main execution async def main(): - print("Running Crawl4AI feature examples...") + # print("Running Crawl4AI feature examples...") - print("\n1. Running Download Example:") + # print("\n1. Running Download Example:") await download_example() - print("\n2. 
Running Content Filtering Example:") + # print("\n2. Running Content Filtering Example:") await content_filtering_example() - print("\n3. Running Local and Raw HTML Example:") + # print("\n3. Running Local and Raw HTML Example:") await local_and_raw_html_example() - print("\n4. Running Browser Management Example:") + # print("\n4. Running Browser Management Example:") await browser_management_example() - print("\n5. Running API Example:") + # print("\n5. Running API Example:") await api_example() if __name__ == "__main__": diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index 30555708..87e468aa 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -15,6 +15,94 @@ docker run -p 11235:11235 unclecode/crawl4ai:basic docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic ``` +## Running with Docker Compose 🐳 + +### Use Docker Compose (From Local Dockerfile or Docker Hub) + +Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub. + +### **Option 1: Using Docker Compose to Build Locally** +If you want to build the image locally, use the provided `docker-compose.local.yml` file. + +```bash +docker-compose -f docker-compose.local.yml up -d +``` + +This will: +1. Build the Docker image from the provided `Dockerfile`. +2. Start the container and expose it on `http://localhost:11235`. + +--- + +### **Option 2: Using Docker Compose with Pre-Built Image from Hub** +If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file. + +```bash +docker-compose -f docker-compose.hub.yml up -d +``` + +This will: +1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration). +2. Start the container and expose it on `http://localhost:11235`. 
+ +--- + +### **Stopping the Running Services** + +To stop the services started via Docker Compose, you can use: + +```bash +docker-compose -f docker-compose.local.yml down +# OR +docker-compose -f docker-compose.hub.yml down +``` + +If the containers don’t stop and the application is still running, check the running containers: + +```bash +docker ps +``` + +Find the `CONTAINER ID` of the running service and stop it forcefully: + +```bash +docker stop <CONTAINER_ID> +``` + +--- + +### **Debugging with Docker Compose** + +- **Check Logs**: To view the container logs: + ```bash + docker-compose -f docker-compose.local.yml logs -f + ``` + +- **Remove Orphaned Containers**: If the service is still running unexpectedly: + ```bash + docker-compose -f docker-compose.local.yml down --remove-orphans + ``` + +- **Manually Remove Network**: If the network is still in use: + ```bash + docker network ls + docker network rm crawl4ai_default + ``` + +--- + +### Why Use Docker Compose? + +Docker Compose is the recommended way to deploy Crawl4AI because: +1. It simplifies multi-container setups. +2. It allows you to define environment variables, resources, and ports in a single file. +3. It makes it easier to switch between local development and production-ready images. + +For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent. 
+ + + + ## API Security 🔒 ### Understanding CRAWL4AI_API_TOKEN diff --git a/main.py b/main.py index ee5f7fc6..6d217410 100644 --- a/main.py +++ b/main.py @@ -26,6 +26,7 @@ from enum import Enum from dataclasses import dataclass import json from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode +from crawl4ai.config import MIN_WORD_THRESHOLD from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, @@ -53,12 +54,20 @@ class ExtractionConfig(BaseModel): type: CrawlerType params: Dict[str, Any] = {} +class ChunkingStrategy(BaseModel): + type: str + params: Dict[str, Any] = {} + +class ContentFilter(BaseModel): + type: str = "bm25" + params: Dict[str, Any] = {} + class CrawlRequest(BaseModel): urls: Union[HttpUrl, List[HttpUrl]] + word_count_threshold: int = MIN_WORD_THRESHOLD extraction_config: Optional[ExtractionConfig] = None - crawler_params: Dict[str, Any] = {} - priority: int = Field(default=5, ge=1, le=10) - ttl: Optional[int] = 3600 + chunking_strategy: Optional[ChunkingStrategy] = None + content_filter: Optional[ContentFilter] = None js_code: Optional[List[str]] = None wait_for: Optional[str] = None css_selector: Optional[str] = None @@ -66,7 +75,10 @@ class CrawlRequest(BaseModel): magic: bool = False extra: Optional[Dict[str, Any]] = {} session_id: Optional[str] = None - cache_mode: Optional[CacheMode] = None + cache_mode: Optional[CacheMode] = CacheMode.ENABLED + priority: int = Field(default=5, ge=1, le=10) + ttl: Optional[int] = 3600 + crawler_params: Dict[str, Any] = {} @dataclass class TaskInfo: @@ -280,6 +292,7 @@ class CrawlerService: if isinstance(request.urls, list): results = await crawler.arun_many( urls=[str(url) for url in request.urls], + word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy=extraction_strategy, js_code=request.js_code, wait_for=request.wait_for, @@ -287,6 +300,7 @@ class CrawlerService: screenshot=request.screenshot, magic=request.magic, session_id=request.session_id, + 
cache_mode=request.cache_mode, **request.extra, ) else: @@ -299,6 +313,7 @@ class CrawlerService: screenshot=request.screenshot, magic=request.magic, session_id=request.session_id, + cache_mode=request.cache_mode, **request.extra, ) diff --git a/requirements.txt b/requirements.txt index e6294cc5..ed259ac9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ tf-playwright-stealth~=1.0 xxhash~=3.4 rank-bm25~=0.2 aiofiles~=24.0 -colorama~=0.4 \ No newline at end of file +colorama~=0.4 +snowballstemmer~=2.2 \ No newline at end of file From 73658c758affac33d1c96ce274735025012da370 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 16:10:43 +0800 Subject: [PATCH 33/50] chore: update .gitignore to include manage-collab.sh --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0acec10f..da4b5f88 100644 --- a/.gitignore +++ b/.gitignore @@ -210,4 +210,5 @@ git_issues.md .issues/ .docs/ .issues/ -.gitboss/ \ No newline at end of file +.gitboss/ +manage-collab.sh \ No newline at end of file From 593c7ad307489edc6a12f2f594bc7827aacbc6f7 Mon Sep 17 00:00:00 2001 From: ntohidikplay <“nasrin@kplay”.team> Date: Tue, 19 Nov 2024 11:45:26 +0100 Subject: [PATCH 34/50] test: trying to push to main --- test.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test.txt diff --git a/test.txt b/test.txt new file mode 100644 index 00000000..e69de29b From 3aae30ed2a2fdd57e1bb9b6374238247d1013974 Mon Sep 17 00:00:00 2001 From: ntohidikplay <“nasrin@kplay”.team> Date: Tue, 19 Nov 2024 11:57:07 +0100 Subject: [PATCH 35/50] test1: trying to push to main --- test1.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test1.txt diff --git a/test1.txt b/test1.txt new file mode 100644 index 00000000..e69de29b From 2f19d386930b48f6758053dd4791b3da9e3a0f29 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:02:41 +0800 Subject: [PATCH 36/50] Update 
.gitignore to include .gitboss/ and todo_executor.md --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4c3e151e..b92a0b0d 100644 --- a/.gitignore +++ b/.gitignore @@ -208,4 +208,6 @@ git_issues.md .tests/ .issues/ .docs/ -.issues/ \ No newline at end of file +.issues/ +.gitboss/ +todo_executor.md \ No newline at end of file From fbcff85ecb6d189fe77ca979017de9e3415481ce Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:03:23 +0800 Subject: [PATCH 37/50] Remove test files --- test.txt | 0 test1.txt | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test.txt delete mode 100644 test1.txt diff --git a/test.txt b/test.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/test1.txt b/test1.txt deleted file mode 100644 index e69de29b..00000000 From a6dad3fc6d436af25f65c083c0f3cb2d6f8f9fc1 Mon Sep 17 00:00:00 2001 From: ntohidikplay <“nasrin@kplay”.team> Date: Tue, 19 Nov 2024 12:09:33 +0100 Subject: [PATCH 38/50] test: trying to push to 0.3.74 --- test3.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test3.txt diff --git a/test3.txt b/test3.txt new file mode 100644 index 00000000..e69de29b From f2cb7d506dbe78bd29d6d6b32bd56f43ec44b352 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:12:14 +0800 Subject: [PATCH 39/50] Delete test3.txt --- test3.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test3.txt diff --git a/test3.txt b/test3.txt deleted file mode 100644 index e69de29b..00000000 From b654c49e55194da47945e726fe18a5fbded68062 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:32:06 +0800 Subject: [PATCH 40/50] Update .gitignore to exclude additional scripts and files --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b92a0b0d..de75f544 100644 --- a/.gitignore +++ b/.gitignore @@ -210,4 +210,6 @@ git_issues.md 
.docs/ .issues/ .gitboss/ -todo_executor.md \ No newline at end of file +todo_executor.md +protect-all-except-feature.sh +manage-collab.sh \ No newline at end of file From 2bdec1fa5a8d13f66598e15ff37d45ef75d5e830 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 19 Nov 2024 19:33:04 +0800 Subject: [PATCH 41/50] chore: add manage-collab.sh to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index da4b5f88..0fb09933 100644 --- a/.gitignore +++ b/.gitignore @@ -211,4 +211,5 @@ git_issues.md .docs/ .issues/ .gitboss/ + manage-collab.sh \ No newline at end of file From d418a04602ebe32d68d248a2995488beec768c61 Mon Sep 17 00:00:00 2001 From: Darwing Medina Date: Wed, 20 Nov 2024 04:52:11 -0600 Subject: [PATCH 42/50] Fix #260 prevent pass duplicated kwargs to scrapping_strategy (#269) Thank you for the suggestions. It totally makes sense now. Change to pop operator. --- crawl4ai/async_webcrawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 38e429ca..fb8c5290 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -197,8 +197,8 @@ class AsyncWebCrawler: html, word_count_threshold=word_count_threshold, css_selector=css_selector, - only_text=kwargs.get("only_text", False), - image_description_min_word_threshold=kwargs.get( + only_text=kwargs.pop("only_text", False), + image_description_min_word_threshold=kwargs.pop( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ), **kwargs, From 3439f7886d170e05e0c97c804b1057187325c2a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?= Date: Wed, 20 Nov 2024 20:30:25 +0800 Subject: [PATCH 43/50] fix: crawler strategy exception handling and fixes (#271) --- crawl4ai/crawler_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/crawler_strategy.py 
b/crawl4ai/crawler_strategy.py index ce802e49..898dcfa8 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -283,7 +283,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): print(f"[LOG] ✅ Crawled {url} successfully!") return html - except InvalidArgumentException: + except InvalidArgumentException as e: if not hasattr(e, 'msg'): e.msg = sanitize_input_encode(str(e)) raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}") From dbb751c8f09f76ffce4046784c2cd2b0021de7d0 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 21 Nov 2024 18:21:43 +0800 Subject: [PATCH 44/50] In this commit, we introduce the new concept of MakrdownGenerationStrategy, which allows us to expand our future strategies to generate better markdown. Right now, we generate raw markdown as we were doing before. We have a new algorithm for fitting markdown based on BM25, and now we add the ability to refine markdown into a citation form. Our links will be extracted and replaced by a citation reference number, and then we will have reference sections at the very end; we add all the links with the descriptions. This format is more suitable for large language models. In case we don't need to pass links, we can reduce the size of the markdown significantly and also attach the list of references as a separate file to a large language model. This commit contains changes for this direction. 
--- crawl4ai/__init__.py | 1 + crawl4ai/async_crawler_strategy.py | 13 +- crawl4ai/async_database.3.73.py | 285 --------------- crawl4ai/async_webcrawler.3.73.py | 344 ------------------ crawl4ai/async_webcrawler.py | 9 +- ...rategy.py => content_scraping_strategy.py} | 229 ++++++------ crawl4ai/markdown_generation_strategy.py | 115 ++++++ crawl4ai/models.py | 13 +- crawl4ai/utils.py | 88 +++++ crawl4ai/web_crawler.py | 2 +- tests/async/test_content_scraper_strategy.py | 4 +- tests/async/test_markdown_genertor.py | 165 +++++++++ 12 files changed, 506 insertions(+), 762 deletions(-) delete mode 100644 crawl4ai/async_database.3.73.py delete mode 100644 crawl4ai/async_webcrawler.3.73.py rename crawl4ai/{content_scrapping_strategy.py => content_scraping_strategy.py} (84%) create mode 100644 crawl4ai/markdown_generation_strategy.py create mode 100644 tests/async/test_markdown_genertor.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index ad9475b4..0ccf13d8 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -1,6 +1,7 @@ # __init__.py from .async_webcrawler import AsyncWebCrawler, CacheMode + from .models import CrawlResult from .__version__ import __version__ # __version__ = "0.3.73" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index e7dc9c54..3f332eb0 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -229,6 +229,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.headless = kwargs.get("headless", True) self.browser_type = kwargs.get("browser_type", "chromium") self.headers = kwargs.get("headers", {}) + self.cookies = kwargs.get("cookies", []) self.sessions = {} self.session_ttl = 1800 self.js_code = js_code @@ -295,6 +296,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Set up the default context if self.default_context: await self.default_context.set_extra_http_headers(self.headers) + if self.cookies: + await 
self.default_context.add_cookies(self.cookies) if self.accept_downloads: await self.default_context.set_default_timeout(60000) await self.default_context.set_default_navigation_timeout(60000) @@ -669,6 +672,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # downloads_path=self.downloads_path if self.accept_downloads else None ) await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + if self.cookies: + await context.add_cookies(self.cookies) await context.set_extra_http_headers(self.headers) page = await context.new_page() self.sessions[session_id] = (context, page, time.time()) @@ -684,6 +689,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): proxy={"server": self.proxy} if self.proxy else None, accept_downloads=self.accept_downloads, ) + if self.cookies: + await context.add_cookies(self.cookies) await context.set_extra_http_headers(self.headers) if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): @@ -828,7 +835,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): for js in js_code: await page.evaluate(js) - await page.wait_for_load_state('networkidle') + # await page.wait_for_timeout(100) + # Check for on execution event await self.execute_hook('on_execution_started', page) @@ -846,6 +854,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") + + # if not wait_for and js_code: + # await page.wait_for_load_state('networkidle', timeout=5000) # Update image dimensions update_image_dimensions_js = """ diff --git a/crawl4ai/async_database.3.73.py b/crawl4ai/async_database.3.73.py deleted file mode 100644 index f86c7f1d..00000000 --- a/crawl4ai/async_database.3.73.py +++ /dev/null @@ -1,285 +0,0 @@ -import os -from pathlib import Path -import aiosqlite -import asyncio -from 
typing import Optional, Tuple, Dict -from contextlib import asynccontextmanager -import logging -import json # Added for serialization/deserialization -from .utils import ensure_content_dirs, generate_content_hash -import xxhash -import aiofiles -# Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -DB_PATH = os.path.join(Path.home(), ".crawl4ai") -os.makedirs(DB_PATH, exist_ok=True) -DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") - -class AsyncDatabaseManager: - def __init__(self, pool_size: int = 10, max_retries: int = 3): - self.db_path = DB_PATH - self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH)) - self.pool_size = pool_size - self.max_retries = max_retries - self.connection_pool: Dict[int, aiosqlite.Connection] = {} - self.pool_lock = asyncio.Lock() - self.connection_semaphore = asyncio.Semaphore(pool_size) - - async def initialize(self): - """Initialize the database and connection pool""" - await self.ainit_db() - - async def cleanup(self): - """Cleanup connections when shutting down""" - async with self.pool_lock: - for conn in self.connection_pool.values(): - await conn.close() - self.connection_pool.clear() - - @asynccontextmanager - async def get_connection(self): - """Connection pool manager""" - async with self.connection_semaphore: - task_id = id(asyncio.current_task()) - try: - async with self.pool_lock: - if task_id not in self.connection_pool: - conn = await aiosqlite.connect( - self.db_path, - timeout=30.0 - ) - await conn.execute('PRAGMA journal_mode = WAL') - await conn.execute('PRAGMA busy_timeout = 5000') - self.connection_pool[task_id] = conn - - yield self.connection_pool[task_id] - - except Exception as e: - logger.error(f"Connection error: {e}") - raise - finally: - async with self.pool_lock: - if task_id in self.connection_pool: - await self.connection_pool[task_id].close() - del self.connection_pool[task_id] - - async def execute_with_retry(self, operation, *args): - """Execute 
database operations with retry logic""" - for attempt in range(self.max_retries): - try: - async with self.get_connection() as db: - result = await operation(db, *args) - await db.commit() - return result - except Exception as e: - if attempt == self.max_retries - 1: - logger.error(f"Operation failed after {self.max_retries} attempts: {e}") - raise - await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff - - async def ainit_db(self): - """Initialize database schema""" - async def _init(db): - await db.execute(''' - CREATE TABLE IF NOT EXISTS crawled_data ( - url TEXT PRIMARY KEY, - html TEXT, - cleaned_html TEXT, - markdown TEXT, - extracted_content TEXT, - success BOOLEAN, - media TEXT DEFAULT "{}", - links TEXT DEFAULT "{}", - metadata TEXT DEFAULT "{}", - screenshot TEXT DEFAULT "", - response_headers TEXT DEFAULT "{}", - downloaded_files TEXT DEFAULT "{}" -- New column added - ) - ''') - - await self.execute_with_retry(_init) - await self.update_db_schema() - - async def update_db_schema(self): - """Update database schema if needed""" - async def _check_columns(db): - cursor = await db.execute("PRAGMA table_info(crawled_data)") - columns = await cursor.fetchall() - return [column[1] for column in columns] - - column_names = await self.execute_with_retry(_check_columns) - - # List of new columns to add - new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] - - for column in new_columns: - if column not in column_names: - await self.aalter_db_add_column(column) - - async def aalter_db_add_column(self, new_column: str): - """Add new column to the database""" - async def _alter(db): - if new_column == 'response_headers': - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') - else: - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') - logger.info(f"Added column '{new_column}' to the database.") - - await self.execute_with_retry(_alter) - - - 
async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]: - """Retrieve cached URL data""" - async def _get(db): - async with db.execute( - ''' - SELECT url, html, cleaned_html, markdown, - extracted_content, success, media, links, - metadata, screenshot, response_headers, - downloaded_files - FROM crawled_data WHERE url = ? - ''', - (url,) - ) as cursor: - row = await cursor.fetchone() - if row: - # Load content from files using stored hashes - html = await self._load_content(row[1], 'html') if row[1] else "" - cleaned = await self._load_content(row[2], 'cleaned') if row[2] else "" - markdown = await self._load_content(row[3], 'markdown') if row[3] else "" - extracted = await self._load_content(row[4], 'extracted') if row[4] else "" - screenshot = await self._load_content(row[9], 'screenshots') if row[9] else "" - - return ( - row[0], # url - html or "", # Return empty string if file not found - cleaned or "", - markdown or "", - extracted or "", - row[5], # success - json.loads(row[6] or '{}'), # media - json.loads(row[7] or '{}'), # links - json.loads(row[8] or '{}'), # metadata - screenshot or "", - json.loads(row[10] or '{}'), # response_headers - json.loads(row[11] or '[]') # downloaded_files - ) - return None - - try: - return await self.execute_with_retry(_get) - except Exception as e: - logger.error(f"Error retrieving cached URL: {e}") - return None - - async def acache_url(self, url: str, html: str, cleaned_html: str, - markdown: str, extracted_content: str, success: bool, - media: str = "{}", links: str = "{}", - metadata: str = "{}", screenshot: str = "", - response_headers: str = "{}", downloaded_files: str = "[]"): - """Cache URL data with content stored in filesystem""" - - # Store content files and get hashes - html_hash = await self._store_content(html, 'html') - cleaned_hash = await self._store_content(cleaned_html, 'cleaned') - markdown_hash = await self._store_content(markdown, 'markdown') - 
extracted_hash = await self._store_content(extracted_content, 'extracted') - screenshot_hash = await self._store_content(screenshot, 'screenshots') - - async def _cache(db): - await db.execute(''' - INSERT INTO crawled_data ( - url, html, cleaned_html, markdown, - extracted_content, success, media, links, metadata, - screenshot, response_headers, downloaded_files - ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - ON CONFLICT(url) DO UPDATE SET - html = excluded.html, - cleaned_html = excluded.cleaned_html, - markdown = excluded.markdown, - extracted_content = excluded.extracted_content, - success = excluded.success, - media = excluded.media, - links = excluded.links, - metadata = excluded.metadata, - screenshot = excluded.screenshot, - response_headers = excluded.response_headers, - downloaded_files = excluded.downloaded_files - ''', (url, html_hash, cleaned_hash, markdown_hash, extracted_hash, - success, media, links, metadata, screenshot_hash, - response_headers, downloaded_files)) - - try: - await self.execute_with_retry(_cache) - except Exception as e: - logger.error(f"Error caching URL: {e}") - - - - async def aget_total_count(self) -> int: - """Get total number of cached URLs""" - async def _count(db): - async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor: - result = await cursor.fetchone() - return result[0] if result else 0 - - try: - return await self.execute_with_retry(_count) - except Exception as e: - logger.error(f"Error getting total count: {e}") - return 0 - - async def aclear_db(self): - """Clear all data from the database""" - async def _clear(db): - await db.execute('DELETE FROM crawled_data') - - try: - await self.execute_with_retry(_clear) - except Exception as e: - logger.error(f"Error clearing database: {e}") - - async def aflush_db(self): - """Drop the entire table""" - async def _flush(db): - await db.execute('DROP TABLE IF EXISTS crawled_data') - - try: - await self.execute_with_retry(_flush) - except Exception as e: - 
logger.error(f"Error flushing database: {e}") - - - async def _store_content(self, content: str, content_type: str) -> str: - """Store content in filesystem and return hash""" - if not content: - return "" - - content_hash = generate_content_hash(content) - file_path = os.path.join(self.content_paths[content_type], content_hash) - - # Only write if file doesn't exist - if not os.path.exists(file_path): - async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: - await f.write(content) - - return content_hash - - async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]: - """Load content from filesystem by hash""" - if not content_hash: - return None - - file_path = os.path.join(self.content_paths[content_type], content_hash) - try: - async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: - return await f.read() - except: - logger.error(f"Failed to load content: {file_path}") - return None - -# Create a singleton instance -async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_webcrawler.3.73.py b/crawl4ai/async_webcrawler.3.73.py deleted file mode 100644 index 03e7a393..00000000 --- a/crawl4ai/async_webcrawler.3.73.py +++ /dev/null @@ -1,344 +0,0 @@ -import os -import time -from pathlib import Path -from typing import Optional -import json -import asyncio -from .models import CrawlResult -from .async_database import async_db_manager -from .chunking_strategy import * -from .extraction_strategy import * -from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse -from .content_scrapping_strategy import WebScrapingStrategy -from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD -from .utils import ( - sanitize_input_encode, - InvalidCSSSelectorError, - format_html -) -from .__version__ import __version__ as crawl4ai_version - -class AsyncWebCrawler: - def __init__( - self, - crawler_strategy: Optional[AsyncCrawlerStrategy] = None, - 
always_by_pass_cache: bool = False, - base_directory: str = str(Path.home()), - **kwargs, - ): - self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( - **kwargs - ) - self.always_by_pass_cache = always_by_pass_cache - # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") - self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") - os.makedirs(self.crawl4ai_folder, exist_ok=True) - os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) - self.ready = False - self.verbose = kwargs.get("verbose", False) - - async def __aenter__(self): - await self.crawler_strategy.__aenter__() - await self.awarmup() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) - - async def awarmup(self): - # Print a message for crawl4ai and its version - if self.verbose: - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") - print("[LOG] 🌤️ Warming up the AsyncWebCrawler") - # await async_db_manager.ainit_db() - # # await async_db_manager.initialize() - # await self.arun( - # url="https://google.com/", - # word_count_threshold=5, - # bypass_cache=False, - # verbose=False, - # ) - self.ready = True - if self.verbose: - print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") - - async def arun( - self, - url: str, - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - bypass_cache: bool = False, - css_selector: str = None, - screenshot: bool = False, - user_agent: str = None, - verbose=True, - disable_cache: bool = False, - no_cache_read: bool = False, - no_cache_write: bool = False, - **kwargs, - ) -> CrawlResult: - """ - Runs the crawler for a single source: URL (web, local file, or raw HTML). - - Args: - url (str): The URL to crawl. Supported prefixes: - - 'http://' or 'https://': Web URL to crawl. - - 'file://': Local file path to process. 
- - 'raw:': Raw HTML content to process. - ... [other existing parameters] - - Returns: - CrawlResult: The result of the crawling and processing. - """ - try: - if disable_cache: - bypass_cache = True - no_cache_read = True - no_cache_write = True - - extraction_strategy = extraction_strategy or NoExtractionStrategy() - extraction_strategy.verbose = verbose - if not isinstance(extraction_strategy, ExtractionStrategy): - raise ValueError("Unsupported extraction strategy") - if not isinstance(chunking_strategy, ChunkingStrategy): - raise ValueError("Unsupported chunking strategy") - - word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) - - async_response: AsyncCrawlResponse = None - cached = None - screenshot_data = None - extracted_content = None - - is_web_url = url.startswith(('http://', 'https://')) - is_local_file = url.startswith("file://") - is_raw_html = url.startswith("raw:") - _url = url if not is_raw_html else "Raw HTML" - - start_time = time.perf_counter() - cached_result = None - if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache: - cached_result = await async_db_manager.aget_cached_url(url) - - if cached_result: - html = sanitize_input_encode(cached_result.html) - extracted_content = sanitize_input_encode(cached_result.extracted_content or "") - if screenshot: - screenshot_data = cached_result.screenshot - if not screenshot_data: - cached_result = None - if verbose: - print( - f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds" - ) - - - if not cached or not html: - t1 = time.perf_counter() - - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) - async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) - html = sanitize_input_encode(async_response.html) - screenshot_data = async_response.screenshot - t2 = time.perf_counter() - if verbose: - print( - 
f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" - ) - - t1 = time.perf_counter() - crawl_result = await self.aprocess_html( - url=url, - html=html, - extracted_content=extracted_content, - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - css_selector=css_selector, - screenshot=screenshot_data, - verbose=verbose, - is_cached=bool(cached), - async_response=async_response, - bypass_cache=bypass_cache, - is_web_url = is_web_url, - is_local_file = is_local_file, - is_raw_html = is_raw_html, - **kwargs, - ) - - if async_response: - crawl_result.status_code = async_response.status_code - crawl_result.response_headers = async_response.response_headers - crawl_result.downloaded_files = async_response.downloaded_files - else: - crawl_result.status_code = 200 - crawl_result.response_headers = cached_result.response_headers if cached_result else {} - - crawl_result.success = bool(html) - crawl_result.session_id = kwargs.get("session_id", None) - - if verbose: - print( - f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds" - ) - - if not is_raw_html and not no_cache_write: - if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url(crawl_result) - - - return crawl_result - - except Exception as e: - if not hasattr(e, "msg"): - e.msg = str(e) - print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}") - return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg) - - async def arun_many( - self, - urls: List[str], - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - bypass_cache: bool = False, - 
css_selector: str = None, - screenshot: bool = False, - user_agent: str = None, - verbose=True, - **kwargs, - ) -> List[CrawlResult]: - """ - Runs the crawler for multiple sources: URLs (web, local files, or raw HTML). - - Args: - urls (List[str]): A list of URLs with supported prefixes: - - 'http://' or 'https://': Web URL to crawl. - - 'file://': Local file path to process. - - 'raw:': Raw HTML content to process. - ... [other existing parameters] - - Returns: - List[CrawlResult]: The results of the crawling and processing. - """ - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed - semaphore = asyncio.Semaphore(semaphore_count) - - async def crawl_with_semaphore(url): - async with semaphore: - return await self.arun( - url, - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - bypass_cache=bypass_cache, - css_selector=css_selector, - screenshot=screenshot, - user_agent=user_agent, - verbose=verbose, - **kwargs, - ) - - tasks = [crawl_with_semaphore(url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - return [result if not isinstance(result, Exception) else str(result) for result in results] - - async def aprocess_html( - self, - url: str, - html: str, - extracted_content: str, - word_count_threshold: int, - extraction_strategy: ExtractionStrategy, - chunking_strategy: ChunkingStrategy, - css_selector: str, - screenshot: str, - verbose: bool, - **kwargs, - ) -> CrawlResult: - t = time.perf_counter() - # Extract content from HTML - try: - _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" - t1 = time.perf_counter() - scrapping_strategy = WebScrapingStrategy() - # result = await scrapping_strategy.ascrap( - result = scrapping_strategy.scrap( - url, - html, - word_count_threshold=word_count_threshold, - css_selector=css_selector, - only_text=kwargs.get("only_text", False), - 
image_description_min_word_threshold=kwargs.get( - "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD - ), - **kwargs, - ) - - if result is None: - raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") - except InvalidCSSSelectorError as e: - raise ValueError(str(e)) - except Exception as e: - raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") - - cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) - markdown = sanitize_input_encode(result.get("markdown", "")) - fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) - fit_html = sanitize_input_encode(result.get("fit_html", "")) - media = result.get("media", []) - links = result.get("links", []) - metadata = result.get("metadata", {}) - - if verbose: - print( - f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds" - ) - - if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): - t1 = time.perf_counter() - # Check if extraction strategy is type of JsonCssExtractionStrategy - if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): - extraction_strategy.verbose = verbose - extracted_content = extraction_strategy.run(url, [html]) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - else: - sections = chunking_strategy.chunk(markdown) - extracted_content = extraction_strategy.run(url, sections) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - if verbose: - print( - f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds" - ) - - screenshot = None if not screenshot else screenshot - - return CrawlResult( - url=url, - html=html, - 
cleaned_html=format_html(cleaned_html), - markdown=markdown, - fit_markdown=fit_markdown, - fit_html= fit_html, - media=media, - links=links, - metadata=metadata, - screenshot=screenshot, - extracted_content=extracted_content, - success=True, - error_message="", - ) - - async def aclear_cache(self): - # await async_db_manager.aclear_db() - await async_db_manager.cleanup() - - async def aflush_cache(self): - await async_db_manager.aflush_db() - - async def aget_cache_size(self): - return await async_db_manager.aget_total_count() - - diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 7d1814b6..2ff7ce0f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -7,14 +7,14 @@ from pathlib import Path from typing import Optional, List, Union import json import asyncio -from .models import CrawlResult +from .models import CrawlResult, MarkdownGenerationResult from .async_database import async_db_manager from .chunking_strategy import * from .content_filter_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode -from .content_scrapping_strategy import WebScrapingStrategy +from .content_scraping_strategy import WebScrapingStrategy from .async_logger import AsyncLogger from .config import ( @@ -476,7 +476,7 @@ class AsyncWebCrawler: html, word_count_threshold=word_count_threshold, css_selector=css_selector, - only_text=kwargs.get("only_text", False), + only_text=kwargs.pop("only_text", False), image_description_min_word_threshold=kwargs.get( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ), @@ -491,6 +491,8 @@ class AsyncWebCrawler: except Exception as e: raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") + markdown_v2: MarkdownGenerationResult = 
result.get("markdown_v2", None) + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) markdown = sanitize_input_encode(result.get("markdown", "")) fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) @@ -542,6 +544,7 @@ class AsyncWebCrawler: url=url, html=html, cleaned_html=format_html(cleaned_html), + markdown_v2=markdown_v2, markdown=markdown, fit_markdown=fit_markdown, fit_html= fit_html, diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scraping_strategy.py similarity index 84% rename from crawl4ai/content_scrapping_strategy.py rename to crawl4ai/content_scraping_strategy.py index 0f470671..3823a78d 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1,6 +1,6 @@ import re # Point 1: Pre-Compile Regular Expressions from abc import ABC, abstractmethod -from typing import Dict, Any +from typing import Dict, Any, Optional from bs4 import BeautifulSoup from concurrent.futures import ThreadPoolExecutor import asyncio, requests, re, os @@ -10,103 +10,19 @@ from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter - +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy +from .models import MarkdownGenerationResult from .utils import ( sanitize_input_encode, sanitize_html, extract_metadata, InvalidCSSSelectorError, - # CustomHTML2Text, + CustomHTML2Text, normalize_url, is_external_url ) -from .html2text import HTML2Text -class CustomHTML2Text(HTML2Text): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.inside_pre = False - self.inside_code = False - self.preserve_tags = set() # Set of tags to preserve - self.current_preserved_tag = None - self.preserved_content = [] - self.preserve_depth = 0 - - # Configuration options - 
self.skip_internal_links = False - self.single_line_break = False - self.mark_code = False - self.include_sup_sub = False - self.body_width = 0 - self.ignore_mailto_links = True - self.ignore_links = False - self.escape_backslash = False - self.escape_dot = False - self.escape_plus = False - self.escape_dash = False - self.escape_snob = False - - def update_params(self, **kwargs): - """Update parameters and set preserved tags.""" - for key, value in kwargs.items(): - if key == 'preserve_tags': - self.preserve_tags = set(value) - else: - setattr(self, key, value) - - def handle_tag(self, tag, attrs, start): - # Handle preserved tags - if tag in self.preserve_tags: - if start: - if self.preserve_depth == 0: - self.current_preserved_tag = tag - self.preserved_content = [] - # Format opening tag with attributes - attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) - self.preserved_content.append(f'<{tag}{attr_str}>') - self.preserve_depth += 1 - return - else: - self.preserve_depth -= 1 - if self.preserve_depth == 0: - self.preserved_content.append(f'') - # Output the preserved HTML block with proper spacing - preserved_html = ''.join(self.preserved_content) - self.o('\n' + preserved_html + '\n') - self.current_preserved_tag = None - return - - # If we're inside a preserved tag, collect all content - if self.preserve_depth > 0: - if start: - # Format nested tags with attributes - attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) - self.preserved_content.append(f'<{tag}{attr_str}>') - else: - self.preserved_content.append(f'') - return - - # Handle pre tags - if tag == 'pre': - if start: - self.o('```\n') - self.inside_pre = True - else: - self.o('\n```') - self.inside_pre = False - # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: - # pass - else: - super().handle_tag(tag, attrs, start) - - def handle_data(self, data, entity_char=False): - """Override handle_data to capture content within preserved tags.""" - if 
self.preserve_depth > 0: - self.preserved_content.append(data) - return - super().handle_data(data, entity_char) - # Pre-compile regular expressions for Open Graph and Twitter metadata OG_REGEX = re.compile(r'^og:') TWITTER_REGEX = re.compile(r'^twitter:') @@ -164,6 +80,98 @@ class WebScrapingStrategy(ContentScrapingStrategy): async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs) + + def _generate_markdown_content(self, + cleaned_html: str, + html: str, + url: str, + success: bool, + **kwargs) -> Dict[str, Any]: + """Generate markdown content using either new strategy or legacy method. + + Args: + cleaned_html: Sanitized HTML content + html: Original HTML content + url: Base URL of the page + success: Whether scraping was successful + **kwargs: Additional options including: + - markdown_generator: Optional[MarkdownGenerationStrategy] + - html2text: Dict[str, Any] options for HTML2Text + - content_filter: Optional[RelevantContentFilter] + - fit_markdown: bool + - fit_markdown_user_query: Optional[str] + - fit_markdown_bm25_threshold: float + + Returns: + Dict containing markdown content in various formats + """ + markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy()) + + if markdown_generator: + try: + markdown_result = markdown_generator.generate_markdown( + cleaned_html=cleaned_html, + base_url=url, + html2text_options=kwargs.get('html2text', {}), + content_filter=kwargs.get('content_filter', None) + ) + + markdown_v2 = MarkdownGenerationResult( + raw_markdown=markdown_result.raw_markdown, + markdown_with_citations=markdown_result.markdown_with_citations, + references_markdown=markdown_result.references_markdown, + fit_markdown=markdown_result.fit_markdown + ) + + return { + 'markdown': markdown_result.raw_markdown, + 'fit_markdown': markdown_result.fit_markdown or "Set flag 
'fit_markdown' to True to get cleaned HTML content.", + 'fit_html': kwargs.get('content_filter', None).filter_content(html) if kwargs.get('content_filter') else "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'markdown_v2': markdown_v2 + } + except Exception as e: + self._log('error', + message="Error using new markdown generation strategy: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + markdown_generator = None + + # Legacy method + h = CustomHTML2Text() + h.update_params(**kwargs.get('html2text', {})) + markdown = h.handle(cleaned_html) + markdown = markdown.replace(' ```', '```') + + fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." + fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." + + if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): + content_filter = kwargs.get('content_filter', None) + if not content_filter: + content_filter = BM25ContentFilter( + user_query=kwargs.get('fit_markdown_user_query', None), + bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + fit_html = content_filter.filter_content(html) + fit_html = '\n'.join('
    {}
    '.format(s) for s in fit_html) + fit_markdown = h.handle(fit_html) + + markdown_v2 = MarkdownGenerationResult( + raw_markdown=markdown, + markdown_with_citations=markdown, + references_markdown=markdown, + fit_markdown=fit_markdown + ) + + return { + 'markdown': markdown, + 'fit_markdown': fit_markdown, + 'fit_html': fit_html, + 'markdown_v2' : markdown_v2 + } + + def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: success = True if not html: @@ -242,8 +250,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): #Score an image for it's usefulness def score_image_for_usefulness(img, base_url, index, images_count): - - image_height = img.get('height') height_value, height_unit = parse_dimension(image_height) image_width = img.get('width') @@ -282,7 +288,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): if not is_valid_image(img, img.parent, img.parent.get('class', [])): return None score = score_image_for_usefulness(img, url, index, total_images) - if score <= IMAGE_SCORE_THRESHOLD: + if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD): return None return { 'src': img.get('src', ''), @@ -545,41 +551,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') - try: - h = CustomHTML2Text() - h.update_params(**kwargs.get('html2text', {})) - markdown = h.handle(cleaned_html) - except Exception as e: - if not h: - h = CustomHTML2Text() - self._log('error', - message="Error converting HTML to markdown: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - markdown = h.handle(sanitize_html(cleaned_html)) - markdown = markdown.replace(' ```', '```') - + markdown_content = self._generate_markdown_content( + cleaned_html=cleaned_html, + html=html, + url=url, + success=success, + **kwargs + ) - - fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML 
content." - fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." - if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): - content_filter = kwargs.get('content_filter', None) - if not content_filter: - content_filter = BM25ContentFilter( - user_query= kwargs.get('fit_markdown_user_query', None), - bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) - fit_html = content_filter.filter_content(html) - fit_html = '\n'.join('
    {}
    '.format(s) for s in fit_html) - fit_markdown = h.handle(fit_html) - - cleaned_html = sanitize_html(cleaned_html) return { - 'markdown': markdown, - 'fit_markdown': fit_markdown, - 'fit_html': fit_html, + **markdown_content, 'cleaned_html': cleaned_html, 'success': success, 'media': media, diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py new file mode 100644 index 00000000..1adb4c28 --- /dev/null +++ b/crawl4ai/markdown_generation_strategy.py @@ -0,0 +1,115 @@ +from abc import ABC, abstractmethod +from typing import Optional, Dict, Any, Tuple +from .models import MarkdownGenerationResult +from .utils import CustomHTML2Text +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter +import re +from urllib.parse import urljoin + +# Pre-compile the regex pattern +LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') + +class MarkdownGenerationStrategy(ABC): + """Abstract base class for markdown generation strategies.""" + + @abstractmethod + def generate_markdown(self, + cleaned_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs) -> MarkdownGenerationResult: + """Generate markdown from cleaned HTML.""" + pass + +class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): + """Default implementation of markdown generation strategy.""" + + def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: + link_map = {} + url_cache = {} # Cache for URL joins + parts = [] + last_end = 0 + counter = 1 + + for match in LINK_PATTERN.finditer(markdown): + parts.append(markdown[last_end:match.start()]) + text, url, title = match.groups() + + # Use cached URL if available, otherwise compute and cache + if base_url and not url.startswith(('http://', 'https://', 'mailto:')): + if url not in url_cache: + url_cache[url] = 
fast_urljoin(base_url, url) + url = url_cache[url] + + if url not in link_map: + desc = [] + if title: desc.append(title) + if text and text != title: desc.append(text) + link_map[url] = (counter, ": " + " - ".join(desc) if desc else "") + counter += 1 + + num = link_map[url][0] + parts.append(f"{text}⟨{num}⟩" if not match.group(0).startswith('!') else f"![{text}⟨{num}⟩]") + last_end = match.end() + + parts.append(markdown[last_end:]) + converted_text = ''.join(parts) + + # Pre-build reference strings + references = ["\n\n## References\n\n"] + references.extend( + f"⟨{num}⟩ {url}{desc}\n" + for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0]) + ) + + return converted_text, ''.join(references) + + def generate_markdown(self, + cleaned_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs) -> MarkdownGenerationResult: + """Generate markdown with citations from cleaned HTML.""" + # Initialize HTML2Text with options + h = CustomHTML2Text() + if html2text_options: + h.update_params(**html2text_options) + + # Generate raw markdown + raw_markdown = h.handle(cleaned_html) + raw_markdown = raw_markdown.replace(' ```', '```') + + # Convert links to citations + if citations: + markdown_with_citations, references_markdown = self.convert_links_to_citations( + raw_markdown, base_url + ) + + # Generate fit markdown if content filter is provided + fit_markdown: Optional[str] = None + if content_filter: + filtered_html = content_filter.filter_content(cleaned_html) + filtered_html = '\n'.join('
    {}
    '.format(s) for s in filtered_html) + fit_markdown = h.handle(filtered_html) + + return MarkdownGenerationResult( + raw_markdown=raw_markdown, + markdown_with_citations=markdown_with_citations, + references_markdown=references_markdown, + fit_markdown=fit_markdown + ) + +def fast_urljoin(base: str, url: str) -> str: + """Fast URL joining for common cases.""" + if url.startswith(('http://', 'https://', 'mailto:', '//')): + return url + if url.startswith('/'): + # Handle absolute paths + if base.endswith('/'): + return base[:-1] + url + return base + url + return urljoin(base, url) \ No newline at end of file diff --git a/crawl4ai/models.py b/crawl4ai/models.py index cab4c45b..122434ad 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, HttpUrl -from typing import List, Dict, Optional, Callable, Awaitable +from typing import List, Dict, Optional, Callable, Awaitable, Union @@ -7,6 +7,12 @@ class UrlModel(BaseModel): url: HttpUrl forced: bool = False +class MarkdownGenerationResult(BaseModel): + raw_markdown: str + markdown_with_citations: str + references_markdown: str + fit_markdown: Optional[str] = None + class CrawlResult(BaseModel): url: str html: str @@ -16,7 +22,8 @@ class CrawlResult(BaseModel): links: Dict[str, List[Dict]] = {} downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None - markdown: Optional[str] = None + markdown: Optional[Union[str, MarkdownGenerationResult]] = None + markdown_v2: Optional[MarkdownGenerationResult] = None fit_markdown: Optional[str] = None fit_html: Optional[str] = None extracted_content: Optional[str] = None @@ -36,3 +43,5 @@ class AsyncCrawlResponse(BaseModel): class Config: arbitrary_types_allowed = True + + diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 9abc5784..b07562df 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -18,6 +18,94 @@ import hashlib from typing import Optional, Tuple, Dict, Any import xxhash + +from .html2text 
class CustomHTML2Text(HTML2Text):
    """HTML-to-Markdown converter with fenced ``<pre>`` blocks and raw-HTML passthrough.

    Two behaviours are layered on top of ``HTML2Text``:

    * ``<pre>`` elements are wrapped in triple-backtick fences.
    * Any tag named in ``preserve_tags`` is copied to the output as raw HTML
      (opening tag with attributes, nested tags, text and closing tag)
      instead of being converted to Markdown.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.inside_pre = False
        self.inside_code = False
        self.preserve_tags = set()  # Set of tags to preserve
        self.current_preserved_tag = None
        self.preserved_content = []  # Raw HTML fragments of the block being preserved
        self.preserve_depth = 0  # Nesting depth of currently open preserved tags

        # Configuration options
        self.skip_internal_links = False
        self.single_line_break = False
        self.mark_code = False
        self.include_sup_sub = False
        self.body_width = 0
        self.ignore_mailto_links = True
        self.ignore_links = False
        self.escape_backslash = False
        self.escape_dot = False
        self.escape_plus = False
        self.escape_dash = False
        self.escape_snob = False

    def update_params(self, **kwargs):
        """Update parameters and set preserved tags.

        ``preserve_tags`` is converted to a set; every other keyword is
        assigned verbatim as an attribute of the converter.
        """
        for key, value in kwargs.items():
            if key == 'preserve_tags':
                self.preserve_tags = set(value)
            else:
                setattr(self, key, value)

    @staticmethod
    def _format_open_tag(tag, attrs):
        """Rebuild an opening tag, skipping attributes whose value is None."""
        attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
        return f'<{tag}{attr_str}>'

    def handle_tag(self, tag, attrs, start):
        """Route a tag event to raw-HTML capture, ``<pre>`` fencing or the base class.

        Args:
            tag: Tag name.
            attrs: Attribute mapping of the tag (``.items()`` is used on it).
            start: True for an opening tag, False for a closing tag.
        """
        # Handle preserved tags
        if tag in self.preserve_tags:
            if start:
                if self.preserve_depth == 0:
                    # Outermost preserved element: start a fresh buffer.
                    self.current_preserved_tag = tag
                    self.preserved_content = []
                self.preserved_content.append(self._format_open_tag(tag, attrs))
                self.preserve_depth += 1
                return
            if self.preserve_depth == 0:
                # Stray closing tag with no matching opener: ignore it rather
                # than let the depth counter go negative and stall capture.
                return
            self.preserve_depth -= 1
            self.preserved_content.append(f'</{tag}>')
            if self.preserve_depth == 0:
                # Output the preserved HTML block with proper spacing
                preserved_html = ''.join(self.preserved_content)
                self.o('\n' + preserved_html + '\n')
                self.current_preserved_tag = None
            return

        # If we're inside a preserved tag, collect all content
        if self.preserve_depth > 0:
            if start:
                self.preserved_content.append(self._format_open_tag(tag, attrs))
            else:
                self.preserved_content.append(f'</{tag}>')
            return

        # Handle pre tags
        if tag == 'pre':
            if start:
                self.o('```\n')
                self.inside_pre = True
            else:
                self.o('\n```')
                self.inside_pre = False
        else:
            super().handle_tag(tag, attrs, start)

    def handle_data(self, data, entity_char=False):
        """Override handle_data to capture content within preserved tags."""
        if self.preserve_depth > 0:
            self.preserved_content.append(data)
            return
        super().handle_data(data, entity_char)
def print_test_result(name: str, result: Dict[str, Any], execution_time: float) -> None:
    """Print a timing banner for a test and save its string results as markdown files.

    Args:
        name: Human-readable test name; lower-cased to build the file names.
        result: Mapping of result key to content. Only ``str`` values are saved,
            each to ``<__location__>/output/<name>_<key>.md``.
        execution_time: Elapsed time in seconds, shown with four decimals.
    """
    print(f"\n{'='*20} {name} {'='*20}")
    print(f"Execution time: {execution_time:.4f} seconds")

    # Save markdown to files.  The directory is created on demand so a fresh
    # checkout does not fail with FileNotFoundError.
    output_dir = os.path.join(__location__, "output")
    os.makedirs(output_dir, exist_ok=True)
    for key, content in result.items():
        if isinstance(content, str):
            target = os.path.join(output_dir, f"{name.lower()}_{key}.md")
            # Explicit UTF-8: the generated markdown contains non-ASCII
            # citation brackets that a legacy default encoding cannot write.
            with open(target, "w", encoding="utf-8") as f:
                f.write(content)
print_test_result("Basic Markdown Conversion", { + 'raw': result.raw_markdown, + 'with_citations': result.markdown_with_citations, + 'references': result.references_markdown + }, execution_time) + + # Basic assertions + assert result.raw_markdown, "Raw markdown should not be empty" + assert result.markdown_with_citations, "Markdown with citations should not be empty" + assert result.references_markdown, "References should not be empty" + assert "⟨" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets" + assert "## References" in result.references_markdown, "Should contain references section" + +def test_relative_links(): + """Test handling of relative links with base URL.""" + markdown = """ + Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com). + Also an [image](/images/test.png) and another [page](/wiki/Banana). + """ + + generator = DefaultMarkdownGenerationStrategy() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://en.wikipedia.org" + ) + + assert "https://en.wikipedia.org/wiki/Apple" in result.references_markdown + assert "https://example.com" in result.references_markdown + assert "https://en.wikipedia.org/images/test.png" in result.references_markdown + +def test_duplicate_links(): + """Test handling of duplicate links.""" + markdown = """ + Here's a [link](/test) and another [link](/test) and a [different link](/other). + """ + + generator = DefaultMarkdownGenerationStrategy() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://example.com" + ) + + # Count citations in markdown + citations = result.markdown_with_citations.count("⟨1⟩") + assert citations == 2, "Same link should use same citation number" + +def test_link_descriptions(): + """Test handling of link titles and descriptions.""" + markdown = """ + Here's a [link with title](/test "Test Title") and a [link with description](/other) to test. 
+ """ + + generator = DefaultMarkdownGenerationStrategy() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://example.com" + ) + + assert "Test Title" in result.references_markdown, "Link title should be in references" + assert "link with description" in result.references_markdown, "Link text should be in references" + +def test_performance_large_document(): + """Test performance with large document.""" + with open(__location__ + "/data/wikipedia.md", "r") as f: + markdown = f.read() + + # Test with multiple iterations + iterations = 5 + times = [] + + generator = DefaultMarkdownGenerationStrategy() + + for i in range(iterations): + start_time = time.perf_counter() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://en.wikipedia.org" + ) + end_time = time.perf_counter() + times.append(end_time - start_time) + + avg_time = sum(times) / len(times) + print(f"\n{'='*20} Performance Test {'='*20}") + print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds") + print(f"Min time: {min(times):.4f} seconds") + print(f"Max time: {max(times):.4f} seconds") + +def test_image_links(): + """Test handling of image links.""" + markdown = """ + Here's an ![image](/image.png "Image Title") and another ![image](/other.jpg). + And a regular [link](/page). 
+ """ + + generator = DefaultMarkdownGenerationStrategy() + result = generator.generate_markdown( + cleaned_html=markdown, + base_url="https://example.com" + ) + + assert "![" in result.markdown_with_citations, "Image markdown syntax should be preserved" + assert "Image Title" in result.references_markdown, "Image title should be in references" + +if __name__ == "__main__": + print("Running markdown generation strategy tests...") + + test_basic_markdown_conversion() + test_relative_links() + test_duplicate_links() + test_link_descriptions() + test_performance_large_document() + test_image_links() + \ No newline at end of file From 006bee4a5a50fed10496b701ecfea350be1b7888 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 16:00:17 +0800 Subject: [PATCH 45/50] feat: enhance image processing capabilities - Enhanced image processing with srcset support and validation checks for better image selection. --- README.md | 4 +- crawl4ai/content_scraping_strategy.py | 145 ++++++++++++++++++++++++-- crawl4ai/tools.py | 34 ++++++ 3 files changed, 172 insertions(+), 11 deletions(-) create mode 100644 crawl4ai/tools.py diff --git a/README.md b/README.md index af0d6610..1d3063c7 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,10 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.74 ✨ -- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster, often completing in under 100 milliseconds (excluding web fetch time)! +- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster! - 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object. +- 📝 **Markdown Generation Strategy:** Flexible markdown generation system supporting custom strategies for different use cases and output formats. 
+- 🔗 **LLM-Friendly Citations:** Automatic conversion of links into numbered citations with organized reference lists, making content more digestible for large language models. - 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content. - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. - 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures. diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 3823a78d..3b41ec82 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -19,9 +19,9 @@ from .utils import ( InvalidCSSSelectorError, CustomHTML2Text, normalize_url, - is_external_url - + is_external_url ) +from .tools import profile_and_time # Pre-compile regular expressions for Open Graph and Twitter metadata OG_REGEX = re.compile(r'^og:') @@ -234,7 +234,26 @@ class WebScrapingStrategy(ContentScrapingStrategy): return text_content return None - def process_image(img, url, index, total_images): + def process_image_old(img, url, index, total_images): + def parse_srcset(srcset_str): + """Parse srcset attribute into list of image URLs with their sizes.""" + if not srcset_str: + return [] + + sources = [] + # Split on http/https and filter empty strings + urls = [f"http{part}" for part in srcset_str.split("http") if part] + + for url in urls: + # Remove trailing comma and whitespace, then split to get width + url = url.strip().rstrip(',') + parts = url.rsplit(' ', 1) + img_url = parts[0].strip() + width = parts[1].rstrip('w') if len(parts) > 1 else None + sources.append({'url': img_url, 'width': width}) + + return sources + #Check if an image has valid display and inside undesired html elements def is_valid_image(img, parent, parent_classes): style = img.get('style', '') @@ -283,14 
def process_image(img, url, index, total_images):
    """Validate and score an ``<img>`` element and list every source URL it offers.

    Args:
        img: The image element. It must provide ``get``, ``attrs``, ``parent``
            and ``find_parent`` (presumably a BeautifulSoup ``Tag`` -- confirm
            against the caller).
        url: Page URL; accepted for interface compatibility, not used here.
        index: Position of this image among the page's images.
        total_images: Number of images on the page.

    Returns:
        A list with one dict per distinct, non-``data:`` source URL (keys
        ``alt``, ``desc``, ``score``, ``type``, ``src``, ``width``), or
        ``None`` when the image is filtered out, scores at or below the
        threshold, or yields no usable URL.
    """
    def parse_srcset(srcset):
        """Split a srcset string into ``{'url', 'width'}`` candidates.

        Candidates are separated by commas; each is a URL optionally followed
        by whitespace and a descriptor.  A trailing ``w`` is stripped from the
        descriptor (``480w`` -> ``480``); other descriptors such as ``2x`` are
        passed through unchanged.  Commas inside a URL are kept.
        """
        sources = []
        current = None  # Candidate whose descriptor is still being read
        for token in srcset.split():
            while token:
                if current is None:
                    token = token.lstrip(',')
                    if not token:
                        break
                    # A URL token that ends with a comma has no descriptor.
                    closes_candidate = token.endswith(',')
                    current = {'url': token.rstrip(','), 'width': None}
                    sources.append(current)
                    if closes_candidate:
                        current = None
                    token = ''
                else:
                    descriptor, comma, token = token.partition(',')
                    if descriptor and current['width'] is None:
                        current['width'] = descriptor.rstrip('w')
                    if comma:
                        current = None
        return sources

    # Constants for checks
    markers = ('button', 'icon', 'logo')
    tags_to_check = ('button', 'input')

    # Pre-fetch commonly used attributes
    style = img.get('style', '')
    alt = img.get('alt', '')
    src = img.get('src', '')
    data_src = img.get('data-src', '')
    width = img.get('width')
    height = img.get('height')
    parent = img.parent
    parent_classes = parent.get('class', [])
    if isinstance(parent_classes, str):
        # Some parsers hand back the raw attribute string instead of a list.
        parent_classes = parent_classes.split()

    # Quick validation checks: hidden images, images inside controls, and
    # images whose parent class / src / alt mention a UI-chrome marker.
    if ('display:none' in style.replace(' ', '') or
            parent.name in tags_to_check or
            any(marker in cls for cls in parent_classes for marker in markers) or
            any(marker in src for marker in markers) or
            any(marker in alt for marker in markers)):
        return None

    # Quick score calculation
    score = 0
    if width and width.isdigit() and int(width) > 150:
        score += 1
    if height and height.isdigit() and int(height) > 150:
        score += 1
    if alt:
        score += 1
    if index / total_images < 0.5:
        # Images in the first half of the page score a point.
        score += 1

    if "data:image/" in src:
        image_format = src.split(',')[0].split(';')[0].split('/')[1]
    else:
        # Drop query string and fragment first so dots inside them are not
        # mistaken for the file extension.
        path_only = src.split('?', 1)[0].split('#', 1)[0]
        image_format = os.path.splitext(path_only)[1].lower().strip('.')

    if image_format in ('jpg', 'png', 'webp', 'avif'):
        score += 1

    if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
        return None

    # Use set for deduplication
    unique_urls = set()
    image_variants = []

    # Base image info template
    base_info = {
        'alt': alt,
        'desc': find_closest_parent_with_useful_text(img),
        'score': score,
        'type': 'image'
    }

    def add_variant(candidate_url, candidate_width=None):
        """Record a source URL once, skipping empty values and inline data URIs."""
        if (isinstance(candidate_url, str) and candidate_url
                and not candidate_url.startswith('data:')
                and candidate_url not in unique_urls):
            unique_urls.add(candidate_url)
            image_variants.append({**base_info, 'src': candidate_url, 'width': candidate_width})

    # Process all sources
    add_variant(src)
    add_variant(data_src)

    # Handle srcset and data-srcset in one pass
    for attr in ('srcset', 'data-srcset'):
        if value := img.get(attr):
            for candidate in parse_srcset(value):
                add_variant(candidate['url'], candidate['width'])

    # Quick picture element check
    if picture := img.find_parent('picture'):
        for source_tag in picture.find_all('source'):
            if srcset := source_tag.get('srcset'):
                for candidate in parse_srcset(srcset):
                    add_variant(candidate['url'], candidate['width'])

    # Framework-specific attributes in one pass
    for attr, value in img.attrs.items():
        if not (attr.startswith('data-') and 'src' in attr
                and isinstance(value, str) and 'http' in value):
            continue
        if 'srcset' in attr:
            # A srcset-style attribute holds several candidates, not one URL.
            for candidate in parse_srcset(value):
                add_variant(candidate['url'], candidate['width'])
        else:
            add_variant(value)

    return image_variants if image_variants else None
def profile_and_time(func):
    """Decorator that times ``func`` and prints a cProfile summary after each call.

    The wrapped callable receives its arguments unchanged, so the decorator
    works on methods and plain functions alike.  After a successful call the
    elapsed wall-clock time and the 20 most expensive functions (sorted by
    cumulative time) are printed to stdout.

    Args:
        func: The callable to profile.

    Returns:
        A wrapper with ``func``'s metadata that returns ``func``'s result and
        lets any exception raised by ``func`` propagate.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Start timer
        start_time = time.perf_counter()

        # Setup profiler
        profiler = cProfile.Profile()
        profiler.enable()
        try:
            # Run function
            result = func(*args, **kwargs)
        finally:
            # Always stop the profiler: a profiler left enabled after an
            # exception would keep running and can prevent the next
            # decorated call from enabling its own.
            profiler.disable()

        # Calculate elapsed time
        elapsed_time = time.perf_counter() - start_time

        # Print timing
        print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds")

        # Print profiling stats
        stats = pstats.Stats(profiler)
        stats.sort_stats('cumulative')  # Sort by cumulative time
        stats.print_stats(20)  # Print top 20 time-consuming functions

        return result
    return wrapper
strategies for different use cases and output formats. - 🔗 **LLM-Friendly Citations:** Automatic conversion of links into numbered citations with organized reference lists, making content more digestible for large language models. - 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content. +- 🖼️ **Enhanced Image Extraction:** Supports srcset, picture elements, and responsive images. - 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. - 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures. - ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter. From 24ad2fe2ddc11250bdd90d42c127a85cbfdb8fd5 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 18:47:17 +0800 Subject: [PATCH 47/50] feat: enhance Markdown generation to include fit_html attribute --- crawl4ai/content_scraping_strategy.py | 13 +++---------- crawl4ai/markdown_generation_strategy.py | 3 ++- crawl4ai/models.py | 1 + 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 3b41ec82..d4b901d2 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -109,25 +109,18 @@ class WebScrapingStrategy(ContentScrapingStrategy): if markdown_generator: try: - markdown_result = markdown_generator.generate_markdown( + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, base_url=url, html2text_options=kwargs.get('html2text', {}), content_filter=kwargs.get('content_filter', None) ) - markdown_v2 = MarkdownGenerationResult( 
- raw_markdown=markdown_result.raw_markdown, - markdown_with_citations=markdown_result.markdown_with_citations, - references_markdown=markdown_result.references_markdown, - fit_markdown=markdown_result.fit_markdown - ) - return { 'markdown': markdown_result.raw_markdown, 'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': kwargs.get('content_filter', None).filter_content(html) if kwargs.get('content_filter') else "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'markdown_v2': markdown_v2 + 'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'markdown_v2': markdown_result } except Exception as e: self._log('error', diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 1adb4c28..7922c413 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -100,7 +100,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown=raw_markdown, markdown_with_citations=markdown_with_citations, references_markdown=references_markdown, - fit_markdown=fit_markdown + fit_markdown=fit_markdown, + fit_html=filtered_html ) def fast_urljoin(base: str, url: str) -> str: diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 122434ad..3a1b8bd1 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -12,6 +12,7 @@ class MarkdownGenerationResult(BaseModel): markdown_with_citations: str references_markdown: str fit_markdown: Optional[str] = None + fit_html: Optional[str] = None class CrawlResult(BaseModel): url: str From e02935dc5b1fee1734f12fb60145193c2b9f5645 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 18:49:22 +0800 Subject: [PATCH 48/50] chore: update README to reflect new features and improvements in version 0.3.74 --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 
deletions(-) diff --git a/README.md b/README.md index e3ced79e..b0f9fff9 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,18 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.74 ✨ -- 🚀 **Blazing Fast Scraping:** The scraping process is now significantly faster! -- 📥 **Download Manager:** Integrated file crawling and downloading capabilities, with full control over file management and tracking within the `CrawlResult` object. -- 📝 **Markdown Generation Strategy:** Flexible markdown generation system supporting custom strategies for different use cases and output formats. -- 🔗 **LLM-Friendly Citations:** Automatic conversion of links into numbered citations with organized reference lists, making content more digestible for large language models. -- 🔎 **Markdown Filter:** Enhanced content extraction using BM25 algorithm to create cleaner markdown with only relevant webpage content. -- 🖼️ **Enhanced Image Extraction:** Supports srcset, picture elements, and responsive images. -- 🗂️ **Local & Raw HTML:** Crawl local files (`file://`) and raw HTML strings (`raw:`) directly. -- 🤖 **Browser Control:** Use your own browser setup for crawling, with persistent contexts and stealth integration to bypass anti-bot measures. -- ☁️ **API & Cache Boost:** CORS support, static file serving, and a new filesystem-based cache for blazing-fast performance. Fine-tune caching with the `CacheMode` enum (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS) and the `always_bypass_cache` parameter. -- 🐳 **API Gateway:** Run Crawl4AI as a local or cloud API service, enabling cross-platform usage through a containerized server with secure token authentication via `CRAWL4AI_API_TOKEN`. -- 🛠️ **Database Improvements:** Enhanced database system for handling larger content sets with improved caching and faster performance. -- 🐛 **Squashed Bugs:** Fixed browser context issues in Docker, memory leaks, enhanced error handling, and improved HTML parsing. 
+🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. +📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. +📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. +🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. +🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. +🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. +🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. +🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. +☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. +🐳 **API Gateway**: Run as an API service with secure token authentication. +🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. +🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. ## Try it Now! From 8dea3f470f5a496a30dada1eab1c3b23ee3560ca Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 18:50:12 +0800 Subject: [PATCH 49/50] chore: update README to include new features and improvements for version 0.3.74 --- README.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index b0f9fff9..fa88a507 100644 --- a/README.md +++ b/README.md @@ -13,18 +13,19 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc ## New in 0.3.74 ✨ -🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. -📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. -📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. -🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. 
-🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. -🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. -🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. -🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. -☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. -🐳 **API Gateway**: Run as an API service with secure token authentication. -🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. -🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. +- 🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. +- 📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. +- 📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. +- 🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. +- 🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. +- 🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. +- 🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. +- 🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. +- ☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. +- 🐳 **API Gateway**: Run as an API service with secure token authentication. +- 🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. +- 🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. + ## Try it Now! 
From 0d0cef343842af2aa369423790e159620e717f6c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 22 Nov 2024 20:14:58 +0800 Subject: [PATCH 50/50] feat: add enhanced markdown generation example with citations and file output --- docs/examples/v0.3.74.overview.py | 109 ++++++++++++++++++++---------- 1 file changed, 74 insertions(+), 35 deletions(-) diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py index 00296740..362ae8fc 100644 --- a/docs/examples/v0.3.74.overview.py +++ b/docs/examples/v0.3.74.overview.py @@ -52,34 +52,7 @@ async def download_example(): else: print("\nNo files were downloaded") -# 2. Content Filtering with BM25 Example -async def content_filtering_example(): - """Example of using the new BM25 content filtering""" - async with AsyncWebCrawler(verbose=True) as crawler: - # Create filter with custom query for OpenAI's blog - content_filter = BM25ContentFilter( - # user_query="Investment and fundraising", - # user_query="Robotic", - bm25_threshold=1.0 - ) - - result = await crawler.arun( - url="https://techcrunch.com/", - content_filter=content_filter, - cache_mode=CacheMode.BYPASS - ) - - print(f"Filtered content: {len(result.fit_markdown)}") - print(f"Filtered content: {result.fit_markdown}") - - # Save html - with open(os.path.join(__data__, "techcrunch.html"), "w") as f: - f.write(result.fit_html) - - with open(os.path.join(__data__, "filtered_content.md"), "w") as f: - f.write(result.fit_markdown) - -# 3. Local File and Raw HTML Processing Example +# 2. Local File and Raw HTML Processing Example async def local_and_raw_html_example(): """Example of processing local files and raw HTML""" # Create a sample HTML file @@ -115,6 +88,68 @@ async def local_and_raw_html_example(): print("Local file content:", local_result.markdown) print("\nRaw HTML content:", raw_result.markdown) +# 3. 
# Enhanced Markdown Generation Example
async def markdown_generation_example():
    """Example of enhanced markdown generation with citations and LLM-friendly features.

    Crawls one page with a BM25 content filter, prints the length of every
    markdown variant exposed on ``result.markdown_v2``, writes each variant to
    ``<__data__>/markdown_examples`` and prints a short sample of the
    citations and references.
    """
    # NOTE: AsyncWebCrawler and BM25ContentFilter are the module-level imports.
    # Importing them again inside this function would turn both names into
    # locals and make the first use below raise UnboundLocalError.
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Create a content filter (optional)
        content_filter = BM25ContentFilter(
            # user_query="History and cultivation",
            bm25_threshold=1.0
        )

        # Crawl once; the filtered output is read from this same result.
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
            content_filter=content_filter,
            cache_mode=CacheMode.BYPASS
        )
        markdown_v2 = result.markdown_v2
        print(markdown_v2.fit_markdown)

        print("\nMarkdown Generation Results:")
        print(f"1. Original markdown length: {len(result.markdown)}")
        print("2. New markdown versions (markdown_v2):")
        print(f"   - Raw markdown length: {len(markdown_v2.raw_markdown)}")
        print(f"   - Citations markdown length: {len(markdown_v2.markdown_with_citations)}")
        print(f"   - References section length: {len(markdown_v2.references_markdown)}")
        if markdown_v2.fit_markdown:
            print(f"   - Filtered markdown length: {len(markdown_v2.fit_markdown)}")

        # Save examples to files
        output_dir = os.path.join(__data__, "markdown_examples")
        os.makedirs(output_dir, exist_ok=True)

        # Save different versions; the filtered one only exists when the
        # content filter produced output.
        versions = [
            ("1_raw_markdown.md", markdown_v2.raw_markdown),
            ("2_citations_markdown.md", markdown_v2.markdown_with_citations),
            ("3_references.md", markdown_v2.references_markdown),
        ]
        if markdown_v2.fit_markdown:
            versions.append(("4_filtered_markdown.md", markdown_v2.fit_markdown))

        for filename, content in versions:
            # Explicit UTF-8 so non-ASCII page content is written the same
            # way on every platform.
            with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
                f.write(content)

        print(f"\nMarkdown examples saved to: {output_dir}")

        # Show a sample of citations and references
        print("\nSample of markdown with citations:")
        print(markdown_v2.markdown_with_citations[:500] + "...\n")
        print("Sample of references:")
        print('\n'.join(markdown_v2.references_markdown.split('\n')[:10]) + "...")