diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index baa06e47..83933a35 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -14,6 +14,7 @@ from pydantic import BaseModel import hashlib import json import uuid +from .models import AsyncCrawlResponse from playwright_stealth import StealthConfig, stealth_async @@ -148,15 +149,6 @@ class ManagedBrowser: except Exception as e: print(f"Error removing temporary directory: {e}") -class AsyncCrawlResponse(BaseModel): - html: str - response_headers: Dict[str, str] - status_code: int - screenshot: Optional[str] = None - get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None - - class Config: - arbitrary_types_allowed = True class AsyncCrawlerStrategy(ABC): @abstractmethod @@ -215,6 +207,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'before_retrieve_html': None } self.extra_args = kwargs.get("extra_args", []) + self.accept_downloads = kwargs.get("accept_downloads", False) + self.downloads_path = kwargs.get("downloads_path") + self._downloaded_files = [] # Track downloaded files for current crawl + if self.accept_downloads and not self.downloads_path: + self.downloads_path = os.path.join(os.getcwd(), "downloads") + os.makedirs(self.downloads_path, exist_ok=True) + async def __aenter__(self): await self.start() @@ -250,7 +249,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Set up the default context if self.default_context: await self.default_context.set_extra_http_headers(self.headers) - + if self.accept_downloads: + await self.default_context.set_default_timeout(60000) + await self.default_context.set_default_navigation_timeout(60000) + self.default_context._impl_obj._options["accept_downloads"] = True + self.default_context._impl_obj._options["downloads_path"] = self.downloads_path + if self.user_agent: await self.default_context.set_extra_http_headers({ "User-Agent": self.user_agent @@ -301,12 +305,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_persistent_context and self.user_data_dir: self.browser = await self.playwright.chromium.launch_persistent_context( user_data_dir=self.user_data_dir, + accept_downloads=self.accept_downloads, + downloads_path=self.downloads_path if self.accept_downloads else None, **browser_args ) self.default_context = self.browser else: self.browser = await self.playwright.chromium.launch(**browser_args) - + except Exception as e: # Fallback to chromium if Chrome channel fails if "chrome" in str(e) and browser_args.get("channel") == "chrome": @@ -565,6 +571,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers = {} status_code = None + # Reset downloaded files list for new crawl + self._downloaded_files = [] + self._cleanup_expired_sessions() session_id = kwargs.get("session_id") @@ -592,10 +601,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Normal context creation for non-persistent or non-Chrome browsers context = await self.browser.new_context( user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, + viewport={"width": 1200, "height": 800}, proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True + java_script_enabled=True, + accept_downloads=self.accept_downloads, + downloads_path=self.downloads_path if self.accept_downloads else None ) await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) await context.set_extra_http_headers(self.headers) @@ -655,6 +665,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) try: + # Set up download handling if enabled + if self.accept_downloads: + page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) + if self.verbose: print(f"[LOG] πŸ•ΈοΈ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") @@ -886,7 +900,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, - get_delayed_content=get_delayed_content + get_delayed_content=get_delayed_content, + downloaded_files=self._downloaded_files if self._downloaded_files else None ) return response except Error as e: @@ -896,6 +911,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.close() # await context.close() + async def _handle_download(self, download): + """Handle file downloads.""" + try: + suggested_filename = download.suggested_filename + download_path = os.path.join(self.downloads_path, suggested_filename) + + if self.verbose: + print(f"[LOG] πŸ“₯ Downloading {suggested_filename} to {download_path}") + + await download.save_as(download_path) + self._downloaded_files.append(download_path) + + if self.verbose: + print(f"[LOG] βœ… Downloaded {suggested_filename} successfully") + except Exception as e: + if self.verbose: + print(f"[ERROR] Failed to handle download: {str(e)}") + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed semaphore = asyncio.Semaphore(semaphore_count) diff --git a/crawl4ai/async_crawler_strategy_0.3.73.py b/crawl4ai/async_crawler_strategy_0.3.73.py deleted file mode 100644 index 54835dad..00000000 --- a/crawl4ai/async_crawler_strategy_0.3.73.py +++ /dev/null @@ -1,965 +0,0 @@ -import asyncio -import base64 -import time -from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional, Awaitable -import os, sys, shutil -import tempfile, subprocess -from playwright.async_api import async_playwright, Page, Browser, Error -from io import BytesIO -from PIL import Image, ImageDraw, ImageFont -from pathlib import Path -from playwright.async_api import ProxySettings -from pydantic import BaseModel -import hashlib -import json -import uuid - -from playwright_stealth import StealthConfig, stealth_async - -stealth_config = StealthConfig( - webdriver=True, - chrome_app=True, - chrome_csi=True, - chrome_load_times=True, - chrome_runtime=True, - navigator_languages=True, - navigator_plugins=True, - navigator_permissions=True, - webgl_vendor=True, - outerdimensions=True, - navigator_hardware_concurrency=True, - media_codecs=True, -) - - -class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): - self.browser_type = browser_type - self.user_data_dir = user_data_dir - self.headless = headless - self.browser_process = None - self.temp_dir = None - self.debugging_port = 9222 - - async def start(self) -> str: - """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. - """ - - # Create temp dir if needed - if not self.user_data_dir: - self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") - self.user_data_dir = self.temp_dir - - # Get browser path and args based on OS and browser type - browser_path = self._get_browser_path() - args = self._get_browser_args() - - # Start browser process - try: - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - await asyncio.sleep(2) # Give browser time to start - return f"http://localhost:{self.debugging_port}" - except Exception as e: - await self.cleanup() - raise Exception(f"Failed to start browser: {e}") - - def _get_browser_path(self) -> str: - """Returns the browser executable path based on OS and browser type""" - if sys.platform == "darwin": # macOS - paths = { - "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", - "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" - } - elif sys.platform == "win32": # Windows - paths = { - "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", - "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", - "webkit": None # WebKit not supported on Windows - } - else: # Linux - paths = { - "chromium": "google-chrome", - "firefox": "firefox", - "webkit": None # WebKit not supported on Linux - } - - return paths.get(self.browser_type) - - def _get_browser_args(self) -> List[str]: - """Returns browser-specific command line arguments""" - base_args = [self._get_browser_path()] - - if self.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.debugging_port}", - f"--user-data-dir={self.user_data_dir}", - ] - if self.headless: - args.append("--headless=new") - elif self.browser_type == "firefox": - args = [ - "--remote-debugging-port", str(self.debugging_port), - "--profile", self.user_data_dir, - ] - if self.headless: - args.append("--headless") - else: - raise NotImplementedError(f"Browser type {self.browser_type} not supported") - - return base_args + args - - async def cleanup(self): - """Cleanup browser process and temporary directory""" - if self.browser_process: - try: - self.browser_process.terminate() - await asyncio.sleep(1) - if self.browser_process.poll() is None: - self.browser_process.kill() - except Exception as e: - print(f"Error terminating browser: {e}") - - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - except Exception as e: - print(f"Error removing temporary directory: {e}") - -class AsyncCrawlResponse(BaseModel): - html: str - response_headers: Dict[str, str] - status_code: int - screenshot: Optional[str] = None - get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None - - class Config: - arbitrary_types_allowed = True - -class AsyncCrawlerStrategy(ABC): - @abstractmethod - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - pass - - @abstractmethod - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - pass - - @abstractmethod - async def take_screenshot(self, **kwargs) -> str: - pass - - @abstractmethod - def update_user_agent(self, user_agent: str): - pass - - @abstractmethod - def set_hook(self, hook_type: str, hook: Callable): - pass - -class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, use_cached_html=False, js_code=None, **kwargs): - self.use_cached_html = use_cached_html - self.user_agent = kwargs.get( - "user_agent", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" - ) - self.proxy = kwargs.get("proxy") - self.proxy_config = kwargs.get("proxy_config") - self.headless = kwargs.get("headless", True) - self.browser_type = kwargs.get("browser_type", "chromium") - self.headers = kwargs.get("headers", {}) - self.sessions = {} - self.session_ttl = 1800 - self.js_code = js_code - self.verbose = kwargs.get("verbose", False) - self.playwright = None - self.browser = None - self.sleep_on_close = kwargs.get("sleep_on_close", False) - self.use_managed_browser = kwargs.get("use_managed_browser", False) - self.user_data_dir = kwargs.get("user_data_dir", None) - self.use_persistent_context = kwargs.get("use_persistent_context", False) - self.chrome_channel = kwargs.get("chrome_channel", "chrome") - self.managed_browser = None - self.default_context = None - self.hooks = { - 'on_browser_created': None, - 'on_user_agent_updated': None, - 'on_execution_started': None, - 'before_goto': None, - 'after_goto': None, - 'before_return_html': None, - 'before_retrieve_html': None - } - self.extra_args = kwargs.get("extra_args", []) - - async def __aenter__(self): - await self.start() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.close() - - async def start(self): - if self.playwright is None: - self.playwright = await async_playwright().start() - if self.browser is None: - if self.use_managed_browser: - # Use managed browser approach - self.managed_browser = ManagedBrowser( - browser_type=self.browser_type, - user_data_dir=self.user_data_dir, - headless=self.headless - ) - cdp_url = await self.managed_browser.start() - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - - # Get the default context that maintains the user profile - contexts = self.browser.contexts - if contexts: - self.default_context = contexts[0] - else: - # If no default context exists, create one - self.default_context = await self.browser.new_context( - viewport={"width": 1920, "height": 1080} - ) - - # Set up the default context - if self.default_context: - await self.default_context.set_extra_http_headers(self.headers) - - if self.user_agent: - await self.default_context.set_extra_http_headers({ - "User-Agent": self.user_agent - }) - else: - browser_args = { - "headless": self.headless, - "args": [ - "--disable-gpu", - "--no-sandbox", - "--disable-dev-shm-usage", - "--disable-blink-features=AutomationControlled", - "--disable-infobars", - "--window-position=0,0", - "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", - # "--disable-http2", - # "--headless=new", # Use the new headless mode - ] - } - - # Add extra args if provided - if self.extra_args: - browser_args["args"].extend(self.extra_args) - - # Add proxy settings if a proxy is specified - if self.proxy: - proxy_settings = ProxySettings(server=self.proxy) - browser_args["proxy"] = proxy_settings - elif self.proxy_config: - proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password")) - browser_args["proxy"] = proxy_settings - - # Select the appropriate browser based on the browser_type - if self.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) - - # Update the headless configuration - if self.headless: - # Use the new headless mode explicitly - browser_args["args"].append("--headless=new") - - await self.execute_hook('on_browser_created', self.browser) - - async def close(self): - if self.sleep_on_close: - await asyncio.sleep(0.5) - - # Close all active sessions - session_ids = list(self.sessions.keys()) - for session_id in session_ids: - await self.kill_session(session_id) - - if self.browser: - await self.browser.close() - self.browser = None - - if self.managed_browser: - await self.managed_browser.cleanup() - self.managed_browser = None - - if self.playwright: - await self.playwright.stop() - self.playwright = None - - def __del__(self): - if self.browser or self.playwright: - asyncio.get_event_loop().run_until_complete(self.close()) - - def set_hook(self, hook_type: str, hook: Callable): - if hook_type in self.hooks: - self.hooks[hook_type] = hook - else: - raise ValueError(f"Invalid hook type: {hook_type}") - - async def execute_hook(self, hook_type: str, *args): - hook = self.hooks.get(hook_type) - if hook: - if asyncio.iscoroutinefunction(hook): - return await hook(*args) - else: - return hook(*args) - return args[0] if args else None - - def update_user_agent(self, user_agent: str): - self.user_agent = user_agent - - def set_custom_headers(self, headers: Dict[str, str]): - self.headers = headers - - async def kill_session(self, session_id: str): - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() - if not self.use_managed_browser: - await context.close() - del self.sessions[session_id] - - def _cleanup_expired_sessions(self): - current_time = time.time() - expired_sessions = [ - sid for sid, (_, _, last_used) in self.sessions.items() - if current_time - last_used > self.session_ttl - ] - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) - - async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): - wait_for = wait_for.strip() - - if wait_for.startswith('js:'): - # Explicitly specified JavaScript - js_code = wait_for[3:].strip() - return await self.csp_compliant_wait(page, js_code, timeout) - elif wait_for.startswith('css:'): - # Explicitly specified CSS selector - css_selector = wait_for[4:].strip() - try: - await page.wait_for_selector(css_selector, timeout=timeout) - except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") - else: - raise ValueError(f"Invalid CSS selector: '{css_selector}'") - else: - # Auto-detect based on content - if wait_for.startswith('()') or wait_for.startswith('function'): - # It's likely a JavaScript function - return await self.csp_compliant_wait(page, wait_for, timeout) - else: - # Assume it's a CSS selector first - try: - await page.wait_for_selector(wait_for, timeout=timeout) - except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") - else: - # If it's not a timeout error, it might be an invalid selector - # Let's try to evaluate it as a JavaScript function as a fallback - try: - return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) - except Error: - raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " - "It should be either a valid CSS selector, a JavaScript function, " - "or explicitly prefixed with 'js:' or 'css:'.") - - async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): - wrapper_js = f""" - async () => {{ - const userFunction = {user_wait_function}; - const startTime = Date.now(); - while (true) {{ - if (await userFunction()) {{ - return true; - }} - if (Date.now() - startTime > {timeout}) {{ - throw new Error('Timeout waiting for condition'); - }} - await new Promise(resolve => setTimeout(resolve, 100)); - }} - }} - """ - - try: - await page.evaluate(wrapper_js) - except TimeoutError: - raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") - except Exception as e: - raise RuntimeError(f"Error in wait condition: {str(e)}") - - async def process_iframes(self, page): - # Find all iframes - iframes = await page.query_selector_all('iframe') - - for i, iframe in enumerate(iframes): - try: - # Add a unique identifier to the iframe - await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') - - # Get the frame associated with this iframe - frame = await iframe.content_frame() - - if frame: - # Wait for the frame to load - await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout - - # Extract the content of the iframe's body - iframe_content = await frame.evaluate('() => document.body.innerHTML') - - # Generate a unique class name for this iframe - class_name = f'extracted-iframe-content-{i}' - - # Replace the iframe with a div containing the extracted content - _iframe = iframe_content.replace('`', '\\`') - await page.evaluate(f""" - () => {{ - const iframe = document.getElementById('iframe-{i}'); - const div = document.createElement('div'); - div.innerHTML = `{_iframe}`; - div.className = '{class_name}'; - iframe.replaceWith(div); - }} - """) - else: - print(f"Warning: Could not access content frame for iframe {i}") - except Exception as e: - print(f"Error processing iframe {i}: {str(e)}") - - # Return the page object - return page - - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - response_headers = {} - status_code = None - - self._cleanup_expired_sessions() - session_id = kwargs.get("session_id") - - # Handle page creation differently for managed browser - if self.use_managed_browser: - if session_id: - # Reuse existing session if available - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not page: - # Create new page in default context if session doesn't exist - page = await self.default_context.new_page() - self.sessions[session_id] = (self.default_context, page, time.time()) - else: - # Create new page in default context for non-session requests - page = await self.default_context.new_page() - else: - if session_id: - context, page, _ = self.sessions.get(session_id, (None, None, None)) - if not context: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True - ) - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) - await context.set_extra_http_headers(self.headers) - page = await context.new_page() - self.sessions[session_id] = (context, page, time.time()) - else: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None - ) - await context.set_extra_http_headers(self.headers) - - if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Inject scripts to override navigator properties - await context.add_init_script(""" - // Pass the Permissions Test. - const originalQuery = window.navigator.permissions.query; - window.navigator.permissions.query = (parameters) => ( - parameters.name === 'notifications' ? - Promise.resolve({ state: Notification.permission }) : - originalQuery(parameters) - ); - Object.defineProperty(navigator, 'webdriver', { - get: () => undefined - }); - window.navigator.chrome = { - runtime: {}, - // Add other properties if necessary - }; - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5], - }); - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'], - }); - Object.defineProperty(document, 'hidden', { - get: () => false - }); - Object.defineProperty(document, 'visibilityState', { - get: () => 'visible' - }); - """) - - page = await context.new_page() - if kwargs.get("magic", False): - await stealth_async(page, stealth_config) - - # Add console message and error logging - if kwargs.get("log_console", False): - page.on("console", lambda msg: print(f"Console: {msg.text}")) - page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) - - try: - if self.verbose: - print(f"[LOG] πŸ•ΈοΈ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") - - if self.use_cached_html: - cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - if os.path.exists(cache_file_path): - html = "" - with open(cache_file_path, "r") as f: - html = f.read() - # retrieve response headers and status code from cache - with open(cache_file_path + ".meta", "r") as f: - meta = json.load(f) - response_headers = meta.get("response_headers", {}) - status_code = meta.get("status_code") - response = AsyncCrawlResponse( - html=html, response_headers=response_headers, status_code=status_code - ) - return response - - if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page) - - # response = await page.goto( - # url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) - # ) - - # Add retry logic for HTTP2 errors - max_retries = kwargs.get("max_retries", 3) - current_try = 0 - - while current_try < max_retries: - try: - response = await page.goto( - url, - # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), - wait_until=kwargs.get("wait_until", "networkidle"), - timeout=kwargs.get("page_timeout", 60000) - ) - break - except Exception as e: - current_try += 1 - if "ERR_HTTP2_PROTOCOL_ERROR" in str(e): - if current_try < max_retries: - # Add exponential backoff - await asyncio.sleep(2 ** current_try) - # Try with different protocol - if 'args' not in kwargs: - kwargs['args'] = [] - kwargs['args'].extend(['--disable-http2']) - continue - if current_try == max_retries: - raise - - # response = await page.goto("about:blank") - # await page.evaluate(f"window.location.href = '{url}'") - - await self.execute_hook('after_goto', page) - - # Get status code and headers - status_code = response.status - response_headers = response.headers - else: - status_code = 200 - response_headers = {} - - # Replace the current wait_for_selector line with this more robust check: - try: - # First wait for body to exist, regardless of visibility - await page.wait_for_selector('body', state='attached', timeout=30000) - - # Then wait for it to become visible by checking CSS - await page.wait_for_function(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - } - """, timeout=30000) - - except Error as e: - # If waiting fails, let's try to diagnose the issue - visibility_info = await page.evaluate(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return { - display: style.display, - visibility: style.visibility, - opacity: style.opacity, - hasContent: body.innerHTML.length, - classList: Array.from(body.classList) - } - } - """) - - if self.verbose: - print(f"Body visibility debug info: {visibility_info}") - - # Even if body is hidden, we might still want to proceed - if kwargs.get('ignore_body_visibility', True): - if self.verbose: - print("Proceeding despite hidden body...") - pass - else: - raise Error(f"Body element is hidden: {visibility_info}") - - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - - js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) - if js_code: - if isinstance(js_code, str): - await page.evaluate(js_code) - elif isinstance(js_code, list): - for js in js_code: - await page.evaluate(js) - - await page.wait_for_load_state('networkidle') - # Check for on execution event - await self.execute_hook('on_execution_started', page) - - if kwargs.get("simulate_user", False) or kwargs.get("magic", False): - # Simulate user interactions - await page.mouse.move(100, 100) - await page.mouse.down() - await page.mouse.up() - await page.keyboard.press('ArrowDown') - - # Handle the wait_for parameter - wait_for = kwargs.get("wait_for") - if wait_for: - try: - await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) - except Exception as e: - raise RuntimeError(f"Wait condition failed: {str(e)}") - - # Update image dimensions - update_image_dimensions_js = """ - () => { - return new Promise((resolve) => { - const filterImage = (img) => { - // Filter out images that are too small - if (img.width < 100 && img.height < 100) return false; - - // Filter out images that are not visible - const rect = img.getBoundingClientRect(); - if (rect.width === 0 || rect.height === 0) return false; - - // Filter out images with certain class names (e.g., icons, thumbnails) - if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; - - // Filter out images with certain patterns in their src (e.g., placeholder images) - if (img.src.includes('placeholder') || img.src.includes('icon')) return false; - - return true; - }; - - const images = Array.from(document.querySelectorAll('img')).filter(filterImage); - let imagesLeft = images.length; - - if (imagesLeft === 0) { - resolve(); - return; - } - - const checkImage = (img) => { - if (img.complete && img.naturalWidth !== 0) { - img.setAttribute('width', img.naturalWidth); - img.setAttribute('height', img.naturalHeight); - imagesLeft--; - if (imagesLeft === 0) resolve(); - } - }; - - images.forEach(img => { - checkImage(img); - if (!img.complete) { - img.onload = () => { - checkImage(img); - }; - img.onerror = () => { - imagesLeft--; - if (imagesLeft === 0) resolve(); - }; - } - }); - - // Fallback timeout of 5 seconds - // setTimeout(() => resolve(), 5000); - resolve(); - }); - } - """ - await page.evaluate(update_image_dimensions_js) - - # Wait a bit for any onload events to complete - await page.wait_for_timeout(100) - - # Process iframes - if kwargs.get("process_iframes", False): - page = await self.process_iframes(page) - - await self.execute_hook('before_retrieve_html', page) - # Check if delay_before_return_html is set then wait for that time - delay_before_return_html = kwargs.get("delay_before_return_html") - if delay_before_return_html: - await asyncio.sleep(delay_before_return_html) - - # Check for remove_overlay_elements parameter - if kwargs.get("remove_overlay_elements", False): - await self.remove_overlay_elements(page) - - html = await page.content() - await self.execute_hook('before_return_html', page, html) - - # Check if kwargs has screenshot=True then take screenshot - screenshot_data = None - if kwargs.get("screenshot"): - # Check we have screenshot_wait_for parameter, if we have simply wait for that time - screenshot_wait_for = kwargs.get("screenshot_wait_for") - if screenshot_wait_for: - await asyncio.sleep(screenshot_wait_for) - screenshot_data = await self.take_screenshot(page) - - if self.verbose: - print(f"[LOG] βœ… Crawled {url} successfully!") - - if self.use_cached_html: - cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() - ) - with open(cache_file_path, "w", encoding="utf-8") as f: - f.write(html) - # store response headers and status code in cache - with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: - json.dump({ - "response_headers": response_headers, - "status_code": status_code - }, f) - - async def get_delayed_content(delay: float = 5.0) -> str: - if self.verbose: - print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") - await asyncio.sleep(delay) - return await page.content() - - response = AsyncCrawlResponse( - html=html, - response_headers=response_headers, - status_code=status_code, - screenshot=screenshot_data, - get_delayed_content=get_delayed_content - ) - return response - except Error as e: - raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") - # finally: - # if not session_id: - # await page.close() - # await context.close() - - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed - semaphore = asyncio.Semaphore(semaphore_count) - - async def crawl_with_semaphore(url): - async with semaphore: - return await self.crawl(url, **kwargs) - - tasks = [crawl_with_semaphore(url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - return [result if not isinstance(result, Exception) else str(result) for result in results] - - async def remove_overlay_elements(self, page: Page) -> None: - """ - Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. - - Args: - page (Page): The Playwright page instance - """ - remove_overlays_js = """ - async () => { - // Function to check if element is visible - const isVisible = (elem) => { - const style = window.getComputedStyle(elem); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - }; - - // Common selectors for popups and overlays - const commonSelectors = [ - // Close buttons first - 'button[class*="close" i]', 'button[class*="dismiss" i]', - 'button[aria-label*="close" i]', 'button[title*="close" i]', - 'a[class*="close" i]', 'span[class*="close" i]', - - // Cookie notices - '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', - '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', - - // Newsletter/subscription dialogs - '[class*="newsletter" i]', '[class*="subscribe" i]', - - // Generic popups/modals - '[class*="popup" i]', '[class*="modal" i]', - '[class*="overlay" i]', '[class*="dialog" i]', - '[role="dialog"]', '[role="alertdialog"]' - ]; - - // Try to click close buttons first - for (const selector of commonSelectors.slice(0, 6)) { - const closeButtons = document.querySelectorAll(selector); - for (const button of closeButtons) { - if (isVisible(button)) { - try { - button.click(); - await new Promise(resolve => setTimeout(resolve, 100)); - } catch (e) { - console.log('Error clicking button:', e); - } - } - } - } - - // Remove remaining overlay elements - const removeOverlays = () => { - // Find elements with high z-index - const allElements = document.querySelectorAll('*'); - for (const elem of allElements) { - const style = window.getComputedStyle(elem); - const zIndex = parseInt(style.zIndex); - const position = style.position; - - if ( - isVisible(elem) && - (zIndex > 999 || position === 'fixed' || position === 'absolute') && - ( - elem.offsetWidth > window.innerWidth * 0.5 || - elem.offsetHeight > window.innerHeight * 0.5 || - style.backgroundColor.includes('rgba') || - parseFloat(style.opacity) < 1 - ) - ) { - elem.remove(); - } - } - - // Remove elements matching common selectors - for (const selector of commonSelectors) { - const elements = document.querySelectorAll(selector); - elements.forEach(elem => { - if (isVisible(elem)) { - elem.remove(); - } - }); - } - }; - - // Remove overlay elements - removeOverlays(); - - // Remove any fixed/sticky position elements at the top/bottom - const removeFixedElements = () => { - const elements = document.querySelectorAll('*'); - elements.forEach(elem => { - const style = window.getComputedStyle(elem); - if ( - (style.position === 'fixed' || style.position === 'sticky') && - isVisible(elem) - ) { - elem.remove(); - } - }); - }; - - removeFixedElements(); - - // Remove empty block elements as: div, p, span, etc. - const removeEmptyBlockElements = () => { - const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); - blockElements.forEach(elem => { - if (elem.innerText.trim() === '') { - elem.remove(); - } - }); - }; - - // Remove margin-right and padding-right from body (often added by modal scripts) - document.body.style.marginRight = '0px'; - document.body.style.paddingRight = '0px'; - document.body.style.overflow = 'auto'; - - // Wait a bit for any animations to complete - await new Promise(resolve => setTimeout(resolve, 100)); - } - """ - - try: - await page.evaluate(remove_overlays_js) - await page.wait_for_timeout(500) # Wait for any animations to complete - except Exception as e: - if self.verbose: - print(f"Warning: Failed to remove overlay elements: {str(e)}") - - async def take_screenshot(self, page: Page) -> str: - try: - # The page is already loaded, just take the screenshot - screenshot = await page.screenshot(full_page=True) - return base64.b64encode(screenshot).decode('utf-8') - except Exception as e: - error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) - - # Generate an error image - img = Image.new('RGB', (800, 600), color='black') - draw = ImageDraw.Draw(img) - font = ImageFont.load_default() - draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) - - buffered = BytesIO() - img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') - finally: - await page.close() - diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 273ca6c9..c52e3db6 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -91,7 +91,8 @@ class AsyncDatabaseManager: links TEXT DEFAULT "{}", metadata TEXT DEFAULT "{}", screenshot TEXT DEFAULT "", - response_headers TEXT DEFAULT "{}" -- New column added + response_headers TEXT DEFAULT "{}", + downloaded_files TEXT DEFAULT "{}" -- New column added ) ''') @@ -108,7 +109,7 @@ class AsyncDatabaseManager: column_names = await self.execute_with_retry(_check_columns) # List of new columns to add - new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers'] + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] for column in new_columns: if column not in column_names: @@ -130,7 +131,7 @@ class AsyncDatabaseManager: async def _get(db): async with db.execute( ''' - SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files FROM crawled_data WHERE url = ? ''', (url,) @@ -149,7 +150,8 @@ class AsyncDatabaseManager: json.loads(row[7] or '{}'), # links json.loads(row[8] or '{}'), # metadata row[9], # screenshot - json.loads(row[10] or '{}') # response_headers + json.loads(row[10] or '{}'), # response_headers + json.loads(row[11] or '[]') # downloaded_files ) return None @@ -171,15 +173,16 @@ class AsyncDatabaseManager: links: str = "{}", metadata: str = "{}", screenshot: str = "", - response_headers: str = "{}" # New parameter added + response_headers: str = "{}", + downloaded_files: str = "[]" ): """Cache URL data with retry logic""" async def _cache(db): await db.execute(''' INSERT INTO crawled_data ( - url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers + url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files ) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(url) DO UPDATE SET html = excluded.html, cleaned_html = excluded.cleaned_html, @@ -190,8 +193,9 @@ class AsyncDatabaseManager: links = excluded.links, metadata = excluded.metadata, screenshot = excluded.screenshot, - response_headers = excluded.response_headers -- Update response_headers - ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers)) + response_headers = excluded.response_headers, -- Update response_headers + downloaded_files = excluded.downloaded_files + ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers, downloaded_files)) try: await self.execute_with_retry(_cache) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 8415f9b9..cec1ace0 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -160,12 +160,35 @@ class AsyncWebCrawler: if async_response: crawl_result.status_code = async_response.status_code crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files else: crawl_result.status_code = 200 crawl_result.response_headers = cached[10] + # crawl_result.downloaded_files = cached[11] crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) + + + if not is_raw_html: + if not bool(cached) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: + await async_db_manager.acache_url( + url = url, + html = html, + cleaned_html = crawl_result.cleaned_html, + markdown = crawl_result.markdown, + extracted_content = extracted_content, + success = True, + media = json.dumps(crawl_result.media), + links = json.dumps(crawl_result.links), + metadata = json.dumps(crawl_result.metadata), + screenshot=screenshot, + response_headers=json.dumps(crawl_result.response_headers), + downloaded_files=json.dumps(crawl_result.downloaded_files), + + ) + + return crawl_result except Exception as e: @@ -233,8 +256,6 @@ class AsyncWebCrawler: css_selector: str, screenshot: str, verbose: bool, - is_cached: bool, - async_response: Optional[AsyncCrawlResponse], **kwargs, ) -> CrawlResult: t = time.time() @@ -298,28 +319,6 @@ class AsyncWebCrawler: screenshot = None if not screenshot else screenshot - response_headers = "{}" # Default value - if async_response: - # Serialize response_headers dict to JSON string - response_headers = json.dumps(async_response.response_headers, ensure_ascii=False) - - - if not kwargs.get("is_raw_html", False): - if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url( - url, - html, - cleaned_html, - markdown, - extracted_content, - True, - json.dumps(media), - json.dumps(links), - json.dumps(metadata), - screenshot=screenshot, - response_headers=response_headers, - ) - return CrawlResult( url=url, html=html, diff --git a/crawl4ai/content_cleaning_strategy.py b/crawl4ai/content_cleaning_strategy.py deleted file mode 100644 index b8a5053d..00000000 --- a/crawl4ai/content_cleaning_strategy.py +++ /dev/null @@ -1,198 +0,0 @@ -from bs4 import BeautifulSoup, Tag -import re -from typing import Optional - -class ContentCleaningStrategy: - def __init__(self): - # Precompile regex patterns for performance - self.negative_patterns = re.compile(r'nav|footer|header|sidebar|ads|comment', re.I) - self.positive_patterns = re.compile(r'content|article|main|post', re.I) - self.priority_tags = {'article', 'main', 'section', 'div'} - self.non_content_tags = {'nav', 'footer', 'header', 'aside'} - # Thresholds - self.text_density_threshold = 9.0 - self.min_word_count = 50 - self.link_density_threshold = 0.2 - self.max_dom_depth = 10 # To prevent excessive DOM traversal - - def clean(self, clean_html: str, soup = None) -> str: - """ - Main function that takes cleaned HTML and returns super cleaned HTML. - - Args: - clean_html (str): The cleaned HTML content. - - Returns: - str: The super cleaned HTML containing only the main content. - """ - try: - if not clean_html or not isinstance(clean_html, str): - return '' - if not soup: - # soup = BeautifulSoup(clean_html, 'html.parser') - soup = BeautifulSoup(clean_html, 'lxml') - main_content = self.extract_main_content(soup) - if main_content: - super_clean_element = self.clean_element(main_content) - return super_clean_element.encode_contents().decode('utf-8') - else: - return '' - except Exception: - # Handle exceptions silently or log them as needed - return '' - - def extract_main_content(self, soup) -> Optional[Tag]: - """ - Identifies and extracts the main content element from the HTML. - - Args: - soup (BeautifulSoup): The parsed HTML soup. - - Returns: - Optional[Tag]: The Tag object containing the main content, or None if not found. - """ - candidates = [] - for element in soup.find_all(self.priority_tags): - if self.is_non_content_tag(element): - continue - if self.has_negative_class_id(element): - continue - score = self.calculate_content_score(element) - candidates.append((score, element)) - - if not candidates: - return None - - # Sort candidates by score in descending order - candidates.sort(key=lambda x: x[0], reverse=True) - # Select the element with the highest score - best_element = candidates[0][1] - return best_element - - def calculate_content_score(self, element: Tag) -> float: - """ - Calculates a score for an element based on various heuristics. - - Args: - element (Tag): The HTML element to score. - - Returns: - float: The content score of the element. - """ - score = 0.0 - - if self.is_priority_tag(element): - score += 5.0 - if self.has_positive_class_id(element): - score += 3.0 - if self.has_negative_class_id(element): - score -= 3.0 - if self.is_high_text_density(element): - score += 2.0 - if self.is_low_link_density(element): - score += 2.0 - if self.has_sufficient_content(element): - score += 2.0 - if self.has_headings(element): - score += 3.0 - - dom_depth = self.calculate_dom_depth(element) - score += min(dom_depth, self.max_dom_depth) * 0.5 # Adjust weight as needed - - return score - - def is_priority_tag(self, element: Tag) -> bool: - """Checks if the element is a priority tag.""" - return element.name in self.priority_tags - - def is_non_content_tag(self, element: Tag) -> bool: - """Checks if the element is a non-content tag.""" - return element.name in self.non_content_tags - - def has_negative_class_id(self, element: Tag) -> bool: - """Checks if the element has negative indicators in its class or id.""" - class_id = ' '.join(filter(None, [ - self.get_attr_str(element.get('class')), - element.get('id', '') - ])) - return bool(self.negative_patterns.search(class_id)) - - def has_positive_class_id(self, element: Tag) -> bool: - """Checks if the element has positive indicators in its class or id.""" - class_id = ' '.join(filter(None, [ - self.get_attr_str(element.get('class')), - element.get('id', '') - ])) - return bool(self.positive_patterns.search(class_id)) - - @staticmethod - def get_attr_str(attr) -> str: - """Converts an attribute value to a string.""" - if isinstance(attr, list): - return ' '.join(attr) - elif isinstance(attr, str): - return attr - else: - return '' - - def is_high_text_density(self, element: Tag) -> bool: - """Determines if the element has high text density.""" - text_density = self.calculate_text_density(element) - return text_density > self.text_density_threshold - - def calculate_text_density(self, element: Tag) -> float: - """Calculates the text density of an element.""" - text_length = len(element.get_text(strip=True)) - tag_count = len(element.find_all()) - tag_count = tag_count or 1 # Prevent division by zero - return text_length / tag_count - - def is_low_link_density(self, element: Tag) -> bool: - """Determines if the element has low link density.""" - link_density = self.calculate_link_density(element) - return link_density < self.link_density_threshold - - def calculate_link_density(self, element: Tag) -> float: - """Calculates the link density of an element.""" - text = element.get_text(strip=True) - if not text: - return 0.0 - link_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a')) - return len(link_text) / len(text) if text else 0.0 - - def has_sufficient_content(self, element: Tag) -> bool: - """Checks if the element has sufficient word count.""" - word_count = len(element.get_text(strip=True).split()) - return word_count >= self.min_word_count - - def calculate_dom_depth(self, element: Tag) -> int: - """Calculates the depth of an element in the DOM tree.""" - depth = 0 - current_element = element - while current_element.parent and depth < self.max_dom_depth: - depth += 1 - current_element = current_element.parent - return depth - - def has_headings(self, element: Tag) -> bool: - """Checks if the element contains heading tags.""" - return bool(element.find(['h1', 'h2', 'h3'])) - - def clean_element(self, element: Tag) -> Tag: - """ - Cleans the selected element by removing unnecessary attributes and nested non-content elements. - - Args: - element (Tag): The HTML element to clean. - - Returns: - Tag: The cleaned HTML element. - """ - for tag in element.find_all(['script', 'style', 'aside']): - tag.decompose() - for tag in element.find_all(): - attrs = dict(tag.attrs) - for attr in attrs: - if attr in ['style', 'onclick', 'onmouseover', 'align', 'bgcolor']: - del tag.attrs[attr] - return element diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py new file mode 100644 index 00000000..850ebf11 --- /dev/null +++ b/crawl4ai/content_filter_strategy.py @@ -0,0 +1,344 @@ +import os +import re +import time +from bs4 import BeautifulSoup, Tag +from typing import List, Tuple, Dict +from rank_bm25 import BM25Okapi +import nltk +from time import perf_counter +from html5lib import parse, treebuilders +from time import perf_counter +from collections import deque +from bs4 import BeautifulSoup, NavigableString, Tag +from .utils import clean_tokens +from abc import ABC, abstractmethod + +class RelevantContentFilter(ABC): + def __init__(self, user_query: str = None): + self.user_query = user_query + self.included_tags = { + # Primary structure + 'article', 'main', 'section', 'div', + # List structures + 'ul', 'ol', 'li', 'dl', 'dt', 'dd', + # Text content + 'p', 'span', 'blockquote', 'pre', 'code', + # Headers + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + # Tables + 'table', 'thead', 'tbody', 'tr', 'td', 'th', + # Other semantic elements + 'figure', 'figcaption', 'details', 'summary', + # Text formatting + 'em', 'strong', 'b', 'i', 'mark', 'small', + # Rich content + 'time', 'address', 'cite', 'q' + } + self.excluded_tags = { + 'nav', 'footer', 'header', 'aside', 'script', + 'style', 'form', 'iframe', 'noscript' + } + self.header_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} + self.negative_patterns = re.compile( + r'nav|footer|header|sidebar|ads|comment|promo|advert|social|share', + re.I + ) + self.min_word_count = 2 + + @abstractmethod + def filter_content(self, html: str) -> List[str]: + """Abstract method to be implemented by specific filtering strategies""" + pass + + def extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str: + """Common method to extract page metadata with fallbacks""" + if self.user_query: + return self.user_query + + query_parts = [] + + # Title + if soup.title: + query_parts.append(soup.title.string) + elif soup.find('h1'): + query_parts.append(soup.find('h1').get_text()) + + # Meta tags + temp = "" + for meta_name in ['keywords', 'description']: + meta = soup.find('meta', attrs={'name': meta_name}) + if meta and meta.get('content'): + query_parts.append(meta['content']) + temp += meta['content'] + + # If still empty, grab first significant paragraph + if not temp: + # Find the first tag P thatits text contains more than 50 characters + for p in body.find_all('p'): + if len(p.get_text()) > 150: + query_parts.append(p.get_text()[:150]) + break + + return ' '.join(filter(None, query_parts)) + + + def extract_text_chunks(self, body: Tag) -> List[Tuple[str, str]]: + """ + Extracts text chunks from a BeautifulSoup body element while preserving order. + Returns list of tuples (text, tag_name) for classification. + + Args: + body: BeautifulSoup Tag object representing the body element + + Returns: + List of (text, tag_name) tuples + """ + # Tags to ignore - inline elements that shouldn't break text flow + INLINE_TAGS = { + 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code', + 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q', + 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup', + 'textarea', 'time', 'tt', 'var' + } + + # Tags that typically contain meaningful headers + HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header'} + + chunks = [] + current_text = [] + chunk_index = 0 + + def should_break_chunk(tag: Tag) -> bool: + """Determine if a tag should cause a break in the current text chunk""" + return ( + tag.name not in INLINE_TAGS + and not (tag.name == 'p' and len(current_text) == 0) + ) + + # Use deque for efficient push/pop operations + stack = deque([(body, False)]) + + while stack: + element, visited = stack.pop() + + if visited: + # End of block element - flush accumulated text + if current_text and should_break_chunk(element): + text = ' '.join(''.join(current_text).split()) + if text: + tag_type = 'header' if element.name in HEADER_TAGS else 'content' + chunks.append((chunk_index, text, tag_type, element)) + chunk_index += 1 + current_text = [] + continue + + if isinstance(element, NavigableString): + if str(element).strip(): + current_text.append(str(element).strip()) + continue + + # Pre-allocate children to avoid multiple list operations + children = list(element.children) + if not children: + continue + + # Mark block for revisit after processing children + stack.append((element, True)) + + # Add children in reverse order for correct processing + for child in reversed(children): + if isinstance(child, (Tag, NavigableString)): + stack.append((child, False)) + + # Handle any remaining text + if current_text: + text = ' '.join(''.join(current_text).split()) + if text: + chunks.append((chunk_index, text, 'content', body)) + + return chunks + + + def extract_text_chunks1(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]: + """Common method for extracting text chunks""" + _text_cache = {} + def fast_text(element: Tag) -> str: + elem_id = id(element) + if elem_id in _text_cache: + return _text_cache[elem_id] + texts = [] + for content in element.contents: + if isinstance(content, str): + text = content.strip() + if text: + texts.append(text) + result = ' '.join(texts) + _text_cache[elem_id] = result + return result + + candidates = [] + index = 0 + + def dfs(element): + nonlocal index + if isinstance(element, Tag): + if element.name in self.included_tags: + if not self.is_excluded(element): + text = fast_text(element) + word_count = len(text.split()) + + # Headers pass through with adjusted minimum + if element.name in self.header_tags: + if word_count >= 3: # Minimal sanity check for headers + candidates.append((index, text, element)) + index += 1 + # Regular content uses standard minimum + elif word_count >= self.min_word_count: + candidates.append((index, text, element)) + index += 1 + + for child in element.children: + dfs(child) + + dfs(soup.body if soup.body else soup) + return candidates + + def is_excluded(self, tag: Tag) -> bool: + """Common method for exclusion logic""" + if tag.name in self.excluded_tags: + return True + class_id = ' '.join(filter(None, [ + ' '.join(tag.get('class', [])), + tag.get('id', '') + ])) + return bool(self.negative_patterns.search(class_id)) + + def clean_element(self, tag: Tag) -> str: + """Common method for cleaning HTML elements with minimal overhead""" + if not tag or not isinstance(tag, Tag): + return "" + + unwanted_tags = {'script', 'style', 'aside', 'form', 'iframe', 'noscript'} + unwanted_attrs = {'style', 'onclick', 'onmouseover', 'align', 'bgcolor', 'class', 'id'} + + # Use string builder pattern for better performance + builder = [] + + def render_tag(elem): + if not isinstance(elem, Tag): + if isinstance(elem, str): + builder.append(elem.strip()) + return + + if elem.name in unwanted_tags: + return + + # Start tag + builder.append(f'<{elem.name}') + + # Add cleaned attributes + attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs} + for key, value in attrs.items(): + builder.append(f' {key}="{value}"') + + builder.append('>') + + # Process children + for child in elem.children: + render_tag(child) + + # Close tag + builder.append(f'') + + try: + render_tag(tag) + return ''.join(builder) + except Exception: + return str(tag) # Fallback to original if anything fails + +class BM25ContentFilter(RelevantContentFilter): + def __init__(self, user_query: str = None, bm25_threshold: float = 1.0): + super().__init__(user_query=user_query) + self.bm25_threshold = bm25_threshold + self.priority_tags = { + 'h1': 5.0, + 'h2': 4.0, + 'h3': 3.0, + 'title': 4.0, + 'strong': 2.0, + 'b': 1.5, + 'em': 1.5, + 'blockquote': 2.0, + 'code': 2.0, + 'pre': 1.5, + 'th': 1.5, # Table headers + } + + def filter_content(self, html: str) -> List[str]: + """Implements content filtering using BM25 algorithm with priority tag handling""" + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, 'lxml') + body = soup.find('body') + query = self.extract_page_query(soup.find('head'), body) + candidates = self.extract_text_chunks(body) + + if not candidates: + return [] + + # Split into priority and regular candidates + priority_candidates = [] + regular_candidates = [] + + for index, chunk, tag_type, tag in candidates: + if tag.name in self.priority_tags: + priority_candidates.append((index, chunk, tag_type, tag)) + else: + regular_candidates.append((index, chunk, tag_type, tag)) + + # Process regular content with BM25 + tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates] + tokenized_query = query.lower().split() + + # Clean from stop words and noise + tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] + tokenized_query = clean_tokens(tokenized_query) + + bm25 = BM25Okapi(tokenized_corpus) + scores = bm25.get_scores(tokenized_query) + + # Score and boost regular candidates + scored_candidates = [ + (score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag) + for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates) + ] + scored_candidates.sort(key=lambda x: x[0], reverse=True) + + # Process scored candidates + selected_tags = set() + selected_candidates = [] + + # First add all priority candidates + for index, chunk, tag_type, tag in priority_candidates: + tag_id = id(tag) + if tag_id not in selected_tags: + selected_candidates.append((index, chunk, tag)) + selected_tags.add(tag_id) + + # Then add scored regular candidates that meet threshold + for score, index, chunk, tag_type, tag in scored_candidates: + if score < self.bm25_threshold: + continue + tag_id = id(tag) + if tag_id not in selected_tags: + selected_candidates.append((index, chunk, tag)) + selected_tags.add(tag_id) + + if not selected_candidates: + return [] + + # Sort by original document order + selected_candidates.sort(key=lambda x: x[0]) + return [self.clean_element(tag) for _, _, tag in selected_candidates] + diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index a2dbbd96..9c81638c 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -8,7 +8,8 @@ from .config import * from bs4 import element, NavigableString, Comment from urllib.parse import urljoin from requests.exceptions import InvalidSchema -from .content_cleaning_strategy import ContentCleaningStrategy +# from .content_cleaning_strategy import ContentCleaningStrategy +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter from .utils import ( sanitize_input_encode, @@ -532,8 +533,15 @@ class WebScrapingStrategy(ContentScrapingStrategy): fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." if kwargs.get('fit_markdown', False): - cleaner = ContentCleaningStrategy() - fit_html = cleaner.clean(cleaned_html) + # cleaner = ContentCleaningStrategy() + # fit_html = cleaner.clean(cleaned_html) + # fit_markdown = h.handle(fit_html) + content_filter = BM25ContentFilter( + user_query= kwargs.get('fit_markdown_user_query', None), + bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + fit_html = content_filter.filter_content(html) + fit_html = '\n'.join('
{}
'.format(s) for s in fit_html) fit_markdown = h.handle(fit_html) cleaned_html = sanitize_html(cleaned_html) diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 4ac06797..cab4c45b 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,5 +1,7 @@ from pydantic import BaseModel, HttpUrl -from typing import List, Dict, Optional +from typing import List, Dict, Optional, Callable, Awaitable + + class UrlModel(BaseModel): url: HttpUrl @@ -12,6 +14,7 @@ class CrawlResult(BaseModel): cleaned_html: Optional[str] = None media: Dict[str, List[Dict]] = {} links: Dict[str, List[Dict]] = {} + downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None markdown: Optional[str] = None fit_markdown: Optional[str] = None @@ -21,4 +24,15 @@ class CrawlResult(BaseModel): error_message: Optional[str] = None session_id: Optional[str] = None response_headers: Optional[dict] = None - status_code: Optional[int] = None \ No newline at end of file + status_code: Optional[int] = None + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + status_code: int + screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + downloaded_files: Optional[List[str]] = None + + class Config: + arbitrary_types_allowed = True diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index d8bd6992..49483f43 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1054,3 +1054,58 @@ def is_external_url(url, base_domain): return False return False + +def clean_tokens(tokens: list[str]) -> list[str]: + # Set of tokens to remove + noise = {'ccp', 'up', '↑', 'β–²', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'} + + STOP_WORDS = { + 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', + 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', + 'to', 'was', 'were', 'will', 'with', + + # Pronouns + 'i', 'you', 'he', 'she', 'it', 'we', 'they', + 'me', 'him', 'her', 'us', 'them', + 'my', 'your', 'his', 'her', 'its', 'our', 'their', + 'mine', 'yours', 'hers', 'ours', 'theirs', + 'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves', + + # Common verbs + 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', + 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', + + # Prepositions + 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', + 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', + 'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into', + 'near', 'of', 'off', 'on', 'out', 'outside', 'over', 'past', 'through', + 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', + + # Conjunctions + 'and', 'but', 'or', 'nor', 'for', 'yet', 'so', + 'although', 'because', 'since', 'unless', + + # Articles + 'a', 'an', 'the', + + # Other common words + 'this', 'that', 'these', 'those', + 'what', 'which', 'who', 'whom', 'whose', + 'when', 'where', 'why', 'how', + 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', + 'can', 'cannot', "can't", 'could', "couldn't", + 'may', 'might', 'must', "mustn't", + 'shall', 'should', "shouldn't", + 'will', "won't", 'would', "wouldn't", + 'not', "n't", 'no', 'nor', 'none' + } + + # Single comprehension, more efficient than multiple passes + return [token for token in tokens + if len(token) > 2 + and token not in noise + and token not in STOP_WORDS + and not token.startswith('↑') + and not token.startswith('β–²') + and not token.startswith('⬆')] diff --git a/tests/async/test_async_doanloader.py b/tests/async/test_async_doanloader.py new file mode 100644 index 00000000..4798b4ca --- /dev/null +++ b/tests/async/test_async_doanloader.py @@ -0,0 +1,229 @@ +import os +import sys +import asyncio +import shutil +from typing import List +import tempfile +import time + +# Add the parent directory to the Python path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.async_webcrawler import AsyncWebCrawler + +class TestDownloads: + def __init__(self): + self.temp_dir = tempfile.mkdtemp(prefix="crawl4ai_test_") + self.download_dir = os.path.join(self.temp_dir, "downloads") + os.makedirs(self.download_dir, exist_ok=True) + self.results: List[str] = [] + + def cleanup(self): + shutil.rmtree(self.temp_dir) + + def log_result(self, test_name: str, success: bool, message: str = ""): + result = f"{'βœ…' if success else '❌'} {test_name}: {message}" + self.results.append(result) + print(result) + + async def test_basic_download(self): + """Test basic file download functionality""" + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + verbose=True + ) as crawler: + # Python.org downloads page typically has stable download links + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Click first download link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + "Basic Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result("Basic Download", False, str(e)) + + async def test_persistent_context_download(self): + """Test downloads with persistent context""" + try: + user_data_dir = os.path.join(self.temp_dir, "user_data") + os.makedirs(user_data_dir, exist_ok=True) + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + use_persistent_context=True, + user_data_dir=user_data_dir, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + "Persistent Context Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result("Persistent Context Download", False, str(e)) + + async def test_multiple_downloads(self): + """Test multiple simultaneous downloads""" + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Click multiple download links + const downloadLinks = document.querySelectorAll('a[href$=".exe"]'); + downloadLinks.forEach(link => link.click()); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 1 + self.log_result( + "Multiple Downloads", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "Not enough files downloaded" + ) + except Exception as e: + self.log_result("Multiple Downloads", False, str(e)) + + async def test_different_browsers(self): + """Test downloads across different browser types""" + browsers = ["chromium", "firefox", "webkit"] + + for browser_type in browsers: + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + browser_type=browser_type, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + f"{browser_type.title()} Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result(f"{browser_type.title()} Download", False, str(e)) + + async def test_edge_cases(self): + """Test various edge cases""" + + # Test 1: Downloads without specifying download path + try: + async with AsyncWebCrawler( + accept_downloads=True, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + self.log_result( + "Default Download Path", + True, + f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}" + ) + except Exception as e: + self.log_result("Default Download Path", False, str(e)) + + # Test 2: Downloads with invalid path + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path="/invalid/path/that/doesnt/exist", + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + self.log_result("Invalid Download Path", False, "Should have raised an error") + except Exception as e: + self.log_result("Invalid Download Path", True, "Correctly handled invalid path") + + # Test 3: Download with accept_downloads=False + try: + async with AsyncWebCrawler( + accept_downloads=False, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + success = result.downloaded_files is None + self.log_result( + "Disabled Downloads", + success, + "Correctly ignored downloads" if success else "Unexpectedly downloaded files" + ) + except Exception as e: + self.log_result("Disabled Downloads", False, str(e)) + + async def run_all_tests(self): + """Run all test cases""" + print("\nπŸ§ͺ Running Download Tests...\n") + + test_methods = [ + self.test_basic_download, + self.test_persistent_context_download, + self.test_multiple_downloads, + self.test_different_browsers, + self.test_edge_cases + ] + + for test in test_methods: + print(f"\nπŸ“ Running {test.__doc__}...") + await test() + await asyncio.sleep(2) # Brief pause between tests + + print("\nπŸ“Š Test Results Summary:") + for result in self.results: + print(result) + + successes = len([r for r in self.results if 'βœ…' in r]) + total = len(self.results) + print(f"\nTotal: {successes}/{total} tests passed") + + self.cleanup() + +async def main(): + tester = TestDownloads() + await tester.run_all_tests() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file