Improved database management and error handling, updated README instructions, refined .gitignore, enhanced async web crawling capabilities, and updated dependencies.

2024-11-04 13:22:13 +08:00
parent 62a86dbe8d
commit 54d5a3a259
11 changed files with 461 additions and 669 deletions
--- a/crawl4ai/async_crawler_strategy
+++ b/crawl4ai/async_crawler_strategy
@@ -1,558 +0,0 @@
-import asyncio
-import base64
-import time
-from abc import ABC, abstractmethod
-from typing import Callable, Dict, Any, List, Optional, Awaitable
-import os
-from playwright.async_api import async_playwright, Page, Browser, Error
-from io import BytesIO
-from PIL import Image, ImageDraw, ImageFont
-from pathlib import Path
-from playwright.async_api import ProxySettings
-from pydantic import BaseModel
-import hashlib
-import json
-import uuid
-from playwright_stealth import stealth_async
-
-class AsyncCrawlResponse(BaseModel):
-    html: str
-    response_headers: Dict[str, str]
-    status_code: int
-    screenshot: Optional[str] = None
-    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
-
-    class Config:
-        arbitrary_types_allowed = True
-
-class AsyncCrawlerStrategy(ABC):
-    @abstractmethod
-    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
-        pass
-    
-    @abstractmethod
-    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
-        pass
-    
-    @abstractmethod
-    async def take_screenshot(self, url: str) -> str:
-        pass
-    
-    @abstractmethod
-    def update_user_agent(self, user_agent: str):
-        pass
-    
-    @abstractmethod
-    def set_hook(self, hook_type: str, hook: Callable):
-        pass
-
-class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
-    def __init__(self, use_cached_html=False, js_code=None, **kwargs):
-        self.use_cached_html = use_cached_html
-        self.user_agent = kwargs.get(
-            "user_agent",
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
-            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-        )
-        self.proxy = kwargs.get("proxy")
-        self.headless = kwargs.get("headless", True)
-        self.browser_type = kwargs.get("browser_type", "chromium")
-        self.headers = kwargs.get("headers", {})
-        self.sessions = {}
-        self.session_ttl = 1800 
-        self.js_code = js_code
-        self.verbose = kwargs.get("verbose", False)
-        self.playwright = None
-        self.browser = None
-        self.hooks = {
-            'on_browser_created': None,
-            'on_user_agent_updated': None,
-            'on_execution_started': None,
-            'before_goto': None,
-            'after_goto': None,
-            'before_return_html': None,
-            'before_retrieve_html': None
-        }
-
-    async def __aenter__(self):
-        await self.start()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        await self.close()
-
-    async def start(self):
-        if self.playwright is None:
-            self.playwright = await async_playwright().start()
-        if self.browser is None:
-            browser_args = {
-                "headless": self.headless,
-                "args": [
-                    "--disable-gpu",
-                    "--no-sandbox",
-                    "--disable-dev-shm-usage",
-                    "--disable-blink-features=AutomationControlled",
-                    "--disable-infobars",
-                    "--window-position=0,0",
-                    "--ignore-certificate-errors",
-                    "--ignore-certificate-errors-spki-list",
-                    # "--headless=new",  # Use the new headless mode
-                ]
-            }
-            
-            # Add proxy settings if a proxy is specified
-            if self.proxy:
-                proxy_settings = ProxySettings(server=self.proxy)
-                browser_args["proxy"] = proxy_settings
-                
-            # Select the appropriate browser based on the browser_type
-            if self.browser_type == "firefox":
-                self.browser = await self.playwright.firefox.launch(**browser_args)
-            elif self.browser_type == "webkit":
-                self.browser = await self.playwright.webkit.launch(**browser_args)
-            else:
-                self.browser = await self.playwright.chromium.launch(**browser_args)
-
-            await self.execute_hook('on_browser_created', self.browser)
-
-    async def close(self):
-        if self.browser:
-            await self.browser.close()
-            self.browser = None
-        if self.playwright:
-            await self.playwright.stop()
-            self.playwright = None
-
-    def __del__(self):
-        if self.browser or self.playwright:
-            asyncio.get_event_loop().run_until_complete(self.close())
-
-    def set_hook(self, hook_type: str, hook: Callable):
-        if hook_type in self.hooks:
-            self.hooks[hook_type] = hook
-        else:
-            raise ValueError(f"Invalid hook type: {hook_type}")
-
-    async def execute_hook(self, hook_type: str, *args):
-        hook = self.hooks.get(hook_type)
-        if hook:
-            if asyncio.iscoroutinefunction(hook):
-                return await hook(*args)
-            else:
-                return hook(*args)
-        return args[0] if args else None
-
-    def update_user_agent(self, user_agent: str):
-        self.user_agent = user_agent
-
-    def set_custom_headers(self, headers: Dict[str, str]):
-        self.headers = headers
-
-    async def kill_session(self, session_id: str):
-        if session_id in self.sessions:
-            context, page, _ = self.sessions[session_id]
-            await page.close()
-            await context.close()
-            del self.sessions[session_id]
-
-    def _cleanup_expired_sessions(self):
-        current_time = time.time()
-        expired_sessions = [
-            sid for sid, (_, _, last_used) in self.sessions.items() 
-            if current_time - last_used > self.session_ttl
-        ]
-        for sid in expired_sessions:
-            asyncio.create_task(self.kill_session(sid))
-            
-    async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
-        wait_for = wait_for.strip()
-        
-        if wait_for.startswith('js:'):
-            # Explicitly specified JavaScript
-            js_code = wait_for[3:].strip()
-            return await self.csp_compliant_wait(page, js_code, timeout)
-        elif wait_for.startswith('css:'):
-            # Explicitly specified CSS selector
-            css_selector = wait_for[4:].strip()
-            try:
-                await page.wait_for_selector(css_selector, timeout=timeout)
-            except Error as e:
-                if 'Timeout' in str(e):
-                    raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'")
-                else:
-                    raise ValueError(f"Invalid CSS selector: '{css_selector}'")
-        else:
-            # Auto-detect based on content
-            if wait_for.startswith('()') or wait_for.startswith('function'):
-                # It's likely a JavaScript function
-                return await self.csp_compliant_wait(page, wait_for, timeout)
-            else:
-                # Assume it's a CSS selector first
-                try:
-                    await page.wait_for_selector(wait_for, timeout=timeout)
-                except Error as e:
-                    if 'Timeout' in str(e):
-                        raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'")
-                    else:
-                        # If it's not a timeout error, it might be an invalid selector
-                        # Let's try to evaluate it as a JavaScript function as a fallback
-                        try:
-                            return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
-                        except Error:
-                            raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
-                                             "It should be either a valid CSS selector, a JavaScript function, "
-                                             "or explicitly prefixed with 'js:' or 'css:'.")
-    
-    async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
-        wrapper_js = f"""
-        async () => {{
-            const userFunction = {user_wait_function};
-            const startTime = Date.now();
-            while (true) {{
-                if (await userFunction()) {{
-                    return true;
-                }}
-                if (Date.now() - startTime > {timeout}) {{
-                    throw new Error('Timeout waiting for condition');
-                }}
-                await new Promise(resolve => setTimeout(resolve, 100));
-            }}
-        }}
-        """
-        
-        try:
-            await page.evaluate(wrapper_js)
-        except TimeoutError:
-            raise TimeoutError(f"Timeout after {timeout}ms waiting for condition")
-        except Exception as e:
-            raise RuntimeError(f"Error in wait condition: {str(e)}")
-
-    async def process_iframes(self, page):
-        # Find all iframes
-        iframes = await page.query_selector_all('iframe')
-        
-        for i, iframe in enumerate(iframes):
-            try:
-                # Add a unique identifier to the iframe
-                await iframe.evaluate(f'(element) => element.id = "iframe-{i}"')
-                
-                # Get the frame associated with this iframe
-                frame = await iframe.content_frame()
-                
-                if frame:
-                    # Wait for the frame to load
-                    await frame.wait_for_load_state('load', timeout=30000)  # 30 seconds timeout
-                    
-                    # Extract the content of the iframe's body
-                    iframe_content = await frame.evaluate('() => document.body.innerHTML')
-                    
-                    # Generate a unique class name for this iframe
-                    class_name = f'extracted-iframe-content-{i}'
-                    
-                    # Replace the iframe with a div containing the extracted content
-                    _iframe = iframe_content.replace('`', '\\`')
-                    await page.evaluate(f"""
-                        () => {{
-                            const iframe = document.getElementById('iframe-{i}');
-                            const div = document.createElement('div');
-                            div.innerHTML = `{_iframe}`;
-                            div.className = '{class_name}';
-                            iframe.replaceWith(div);
-                        }}
-                    """)
-                else:
-                    print(f"Warning: Could not access content frame for iframe {i}")
-            except Exception as e:
-                print(f"Error processing iframe {i}: {str(e)}")
-
-        # Return the page object
-        return page  
-    
-    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
-        response_headers = {}
-        status_code = None
-        
-        self._cleanup_expired_sessions()
-        session_id = kwargs.get("session_id")
-        if session_id:
-            context, page, _ = self.sessions.get(session_id, (None, None, None))
-            if not context:
-                context = await self.browser.new_context(
-                    user_agent=self.user_agent,
-                    viewport={"width": 1920, "height": 1080},
-                    proxy={"server": self.proxy} if self.proxy else None
-                )
-                await context.set_extra_http_headers(self.headers)
-                page = await context.new_page()
-                self.sessions[session_id] = (context, page, time.time())
-        else:
-            context = await self.browser.new_context(
-                user_agent=self.user_agent,
-                viewport={"width": 1920, "height": 1080},
-                proxy={"server": self.proxy} if self.proxy else None
-            )
-            await context.set_extra_http_headers(self.headers)
-            
-            if kwargs.get("override_navigator", False):
-                # Inject scripts to override navigator properties
-                await context.add_init_script("""
-                    // Pass the Permissions Test.
-                    const originalQuery = window.navigator.permissions.query;
-                    window.navigator.permissions.query = (parameters) => (
-                        parameters.name === 'notifications' ?
-                            Promise.resolve({ state: Notification.permission }) :
-                            originalQuery(parameters)
-                    );
-                    Object.defineProperty(navigator, 'webdriver', {
-                        get: () => undefined
-                    });
-                    window.navigator.chrome = {
-                        runtime: {},
-                        // Add other properties if necessary
-                    };
-                    Object.defineProperty(navigator, 'plugins', {
-                        get: () => [1, 2, 3, 4, 5],
-                    });
-                    Object.defineProperty(navigator, 'languages', {
-                        get: () => ['en-US', 'en'],
-                    });
-                    Object.defineProperty(document, 'hidden', {
-                        get: () => false
-                    });
-                    Object.defineProperty(document, 'visibilityState', {
-                        get: () => 'visible'
-                    });
-                """)
-            
-            page = await context.new_page()
-
-        try:
-            if self.verbose:
-                print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")
-
-            if self.use_cached_html:
-                cache_file_path = os.path.join(
-                    Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
-                )
-                if os.path.exists(cache_file_path):
-                    html = ""
-                    with open(cache_file_path, "r") as f:
-                        html = f.read()
-                    # retrieve response headers and status code from cache
-                    with open(cache_file_path + ".meta", "r") as f:
-                        meta = json.load(f)
-                        response_headers = meta.get("response_headers", {})
-                        status_code = meta.get("status_code")
-                    response = AsyncCrawlResponse(
-                        html=html, response_headers=response_headers, status_code=status_code
-                    )
-                    return response
-
-            if not kwargs.get("js_only", False):
-                await self.execute_hook('before_goto', page)
-                
-                response = await page.goto("about:blank")
-                await stealth_async(page)
-                response = await page.goto(
-                    url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)
-                )
-                
-                # await stealth_async(page)
-                # response = await page.goto("about:blank")
-                # await stealth_async(page)
-                # await page.evaluate(f"window.location.href = '{url}'")
-                
-                await self.execute_hook('after_goto', page)
-                
-                # Get status code and headers
-                status_code = response.status
-                response_headers = response.headers
-            else:
-                status_code = 200
-                response_headers = {}
-
-            await page.wait_for_selector('body')
-            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-
-            js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
-            if js_code:
-                if isinstance(js_code, str):
-                    await page.evaluate(js_code)
-                elif isinstance(js_code, list):
-                    for js in js_code:
-                        await page.evaluate(js)
-                
-                await page.wait_for_load_state('networkidle')
-                # Check for on execution event
-                await self.execute_hook('on_execution_started', page)
-                
-            if kwargs.get("simulate_user", False):
-                # Simulate user interactions
-                await page.mouse.move(100, 100)
-                await page.mouse.down()
-                await page.mouse.up()
-                await page.keyboard.press('ArrowDown')
-
-            # Handle the wait_for parameter
-            wait_for = kwargs.get("wait_for")
-            if wait_for:
-                try:
-                    await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000))
-                except Exception as e:
-                    raise RuntimeError(f"Wait condition failed: {str(e)}")
-
-
-            
-            # Update image dimensions
-            update_image_dimensions_js = """
-            () => {
-                return new Promise((resolve) => {
-                    const filterImage = (img) => {
-                        // Filter out images that are too small
-                        if (img.width < 100 && img.height < 100) return false;
-                        
-                        // Filter out images that are not visible
-                        const rect = img.getBoundingClientRect();
-                        if (rect.width === 0 || rect.height === 0) return false;
-                        
-                        // Filter out images with certain class names (e.g., icons, thumbnails)
-                        if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false;
-                        
-                        // Filter out images with certain patterns in their src (e.g., placeholder images)
-                        if (img.src.includes('placeholder') || img.src.includes('icon')) return false;
-                        
-                        return true;
-                    };
-
-                    const images = Array.from(document.querySelectorAll('img')).filter(filterImage);
-                    let imagesLeft = images.length;
-                    
-                    if (imagesLeft === 0) {
-                        resolve();
-                        return;
-                    }
-
-                    const checkImage = (img) => {
-                        if (img.complete && img.naturalWidth !== 0) {
-                            img.setAttribute('width', img.naturalWidth);
-                            img.setAttribute('height', img.naturalHeight);
-                            imagesLeft--;
-                            if (imagesLeft === 0) resolve();
-                        }
-                    };
-
-                    images.forEach(img => {
-                        checkImage(img);
-                        if (!img.complete) {
-                            img.onload = () => {
-                                checkImage(img);
-                            };
-                            img.onerror = () => {
-                                imagesLeft--;
-                                if (imagesLeft === 0) resolve();
-                            };
-                        }
-                    });
-
-                    // Fallback timeout of 5 seconds
-                    setTimeout(() => resolve(), 5000);
-                });
-            }
-            """
-            await page.evaluate(update_image_dimensions_js)
-
-            # Wait a bit for any onload events to complete
-            await page.wait_for_timeout(100)
-
-            # Process iframes
-            if kwargs.get("process_iframes", False):
-                page = await self.process_iframes(page)
-            
-            await self.execute_hook('before_retrieve_html', page)
-            # Check if delay_before_return_html is set then wait for that time
-            delay_before_return_html = kwargs.get("delay_before_return_html")
-            if delay_before_return_html:
-                await asyncio.sleep(delay_before_return_html)
-                
-            html = await page.content()
-            await self.execute_hook('before_return_html', page, html)
-            
-            # Check if kwargs has screenshot=True then take screenshot
-            screenshot_data = None
-            if kwargs.get("screenshot"):
-                screenshot_data = await self.take_screenshot(url)            
-
-            if self.verbose:
-                print(f"[LOG] ✅ Crawled {url} successfully!")
-
-            if self.use_cached_html:
-                cache_file_path = os.path.join(
-                    Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
-                )
-                with open(cache_file_path, "w", encoding="utf-8") as f:
-                    f.write(html)
-                # store response headers and status code in cache
-                with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
-                    json.dump({
-                        "response_headers": response_headers,
-                        "status_code": status_code
-                    }, f)
-
-            async def get_delayed_content(delay: float = 5.0) -> str:
-                if self.verbose:
-                    print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
-                await asyncio.sleep(delay)
-                return await page.content()
-                
-            response = AsyncCrawlResponse(
-                html=html, 
-                response_headers=response_headers, 
-                status_code=status_code,
-                screenshot=screenshot_data,
-                get_delayed_content=get_delayed_content
-            )
-            return response
-        except Error as e:
-            raise Error(f"Failed to crawl {url}: {str(e)}")
-        finally:
-            if not session_id:
-                await page.close()
-                await context.close()
-
-    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
-        semaphore_count = kwargs.get('semaphore_count', 5)  # Adjust as needed
-        semaphore = asyncio.Semaphore(semaphore_count)
-
-        async def crawl_with_semaphore(url):
-            async with semaphore:
-                return await self.crawl(url, **kwargs)
-
-        tasks = [crawl_with_semaphore(url) for url in urls]
-        results = await asyncio.gather(*tasks, return_exceptions=True)
-        return [result if not isinstance(result, Exception) else str(result) for result in results]
-
-    async def take_screenshot(self, url: str, wait_time=1000) -> str:
-        async with await self.browser.new_context(user_agent=self.user_agent) as context:
-            page = await context.new_page()
-            try:
-                await page.goto(url, wait_until="domcontentloaded", timeout=30000)
-                # Wait for a specified time (default is 1 second)
-                await page.wait_for_timeout(wait_time)
-                screenshot = await page.screenshot(full_page=True)
-                return base64.b64encode(screenshot).decode('utf-8')
-            except Exception as e:
-                error_message = f"Failed to take screenshot: {str(e)}"
-                print(error_message)
-
-                # Generate an error image
-                img = Image.new('RGB', (800, 600), color='black')
-                draw = ImageDraw.Draw(img)
-                font = ImageFont.load_default()
-                draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
-                
-                buffered = BytesIO()
-                img.save(buffered, format="JPEG")
-                return base64.b64encode(buffered.getvalue()).decode('utf-8')
-            finally:
-                await page.close()
-
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -3,7 +3,8 @@ import base64
 import time
 from abc import ABC, abstractmethod
 from typing import Callable, Dict, Any, List, Optional, Awaitable
-import os
+import os, sys, shutil
+import tempfile, subprocess
 from playwright.async_api import async_playwright, Page, Browser, Error
 from io import BytesIO
 from PIL import Image, ImageDraw, ImageFont
@@ -13,6 +14,7 @@ from pydantic import BaseModel
 import hashlib
 import json
 import uuid
+
 from playwright_stealth import StealthConfig, stealth_async

 stealth_config = StealthConfig(
@@ -31,6 +33,106 @@ stealth_config = StealthConfig(
 )


+class ManagedBrowser:  # Launches a browser as a local subprocess with remote debugging enabled and cleans it up afterwards.
+    def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False):
+        self.browser_type = browser_type  # key into the per-OS path table in _get_browser_path()
+        self.user_data_dir = user_data_dir  # profile directory; start() substitutes a temp dir when this is falsy
+        self.headless = headless  # adds a headless flag to the command line in _get_browser_args()
+        self.browser_process = None  # subprocess.Popen handle, assigned by start()
+        self.temp_dir = None  # set only when start() creates the profile dir itself
+        self.debugging_port = 9222  # fixed port; NOTE(review): concurrent instances would presumably collide on it - confirm
+
+    async def start(self) -> str:
+        """
+        Starts the browser process and returns the CDP endpoint URL.
+        If user_data_dir is not provided, creates a temporary directory.
+        """
+        
+        # Create temp dir if needed (remembered in temp_dir so cleanup() can remove it)
+        if not self.user_data_dir:
+            self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
+            self.user_data_dir = self.temp_dir
+
+        # Get browser path and args based on OS and browser type
+        browser_path = self._get_browser_path()  # NOTE(review): unused; _get_browser_args() resolves the path again
+        args = self._get_browser_args()  # may raise NotImplementedError before the try below, so cleanup() is not run on that path
+
+        # Start browser process
+        try:
+            self.browser_process = subprocess.Popen(
+                args,
+                stdout=subprocess.PIPE,  # NOTE(review): nothing in this class reads stdout/stderr
+                stderr=subprocess.PIPE
+            )
+            await asyncio.sleep(2)  # Give browser time to start (fixed delay; the port is not probed for readiness)
+            return f"http://localhost:{self.debugging_port}"
+        except Exception as e:
+            await self.cleanup()  # stop any started process and remove the temp profile before raising
+            raise Exception(f"Failed to start browser: {e}")
+
+    def _get_browser_path(self) -> str:  # NOTE(review): yields None for unknown types and for the None table entries
+        """Returns the browser executable path based on OS and browser type"""
+        if sys.platform == "darwin":  # macOS
+            paths = {
+                "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",  # "chromium" maps to Google Chrome
+                "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
+                "webkit": "/Applications/Safari.app/Contents/MacOS/Safari"
+            }
+        elif sys.platform == "win32":  # Windows
+            paths = {
+                "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
+                "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
+                "webkit": None  # WebKit not supported on Windows
+            }
+        else:  # Linux (also reached on any other platform that is neither macOS nor Windows)
+            paths = {
+                "chromium": "google-chrome",  # bare command names; presumably resolved through PATH at launch - confirm
+                "firefox": "firefox",
+                "webkit": None  # WebKit not supported on Linux
+            }
+        
+        return paths.get(self.browser_type)  # .get() returns None when browser_type is not a key
+
+    def _get_browser_args(self) -> List[str]:
+        """Returns browser-specific command line arguments"""
+        base_args = [self._get_browser_path()]  # executable goes first; the list is passed to subprocess.Popen by start()
+        
+        if self.browser_type == "chromium":
+            args = [
+                f"--remote-debugging-port={self.debugging_port}",
+                f"--user-data-dir={self.user_data_dir}",
+            ]
+            if self.headless:
+                args.append("--headless=new")
+        elif self.browser_type == "firefox":
+            args = [
+                "--remote-debugging-port", str(self.debugging_port),  # flag and value passed as separate argv items
+                "--profile", self.user_data_dir,
+            ]
+            if self.headless:
+                args.append("--headless")
+        else:  # any other browser_type, including "webkit", is rejected here
+            raise NotImplementedError(f"Browser type {self.browser_type} not supported")
+            
+        return base_args + args
+
+    async def cleanup(self):
+        """Cleanup browser process and temporary directory"""
+        if self.browser_process:
+            try:
+                self.browser_process.terminate()  # request termination first
+                await asyncio.sleep(1)  # wait one second before checking whether it exited
+                if self.browser_process.poll() is None:  # still running: force kill
+                    self.browser_process.kill()
+            except Exception as e:
+                print(f"Error terminating browser: {e}")  # best effort: report and continue to directory removal
+
+        if self.temp_dir and os.path.exists(self.temp_dir):  # only the dir created by start(); a caller-supplied user_data_dir is left alone
+            try:
+                shutil.rmtree(self.temp_dir)
+            except Exception as e:
+                print(f"Error removing temporary directory: {e}")
+
 class AsyncCrawlResponse(BaseModel):
    html: str
    response_headers: Dict[str, str]
@@ -82,6 +184,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        self.playwright = None
        self.browser = None
        self.sleep_on_close = kwargs.get("sleep_on_close", False)
+        self.use_managed_browser = kwargs.get("use_managed_browser", False)
+        self.user_data_dir = kwargs.get("user_data_dir", None)
+        self.managed_browser = None
        self.hooks = {
            'on_browser_created': None,
            'on_user_agent_updated': None,
@@ -103,36 +208,46 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        if self.playwright is None:
            self.playwright = await async_playwright().start()
        if self.browser is None:
-            browser_args = {
-                "headless": self.headless,
-                "args": [
-                    "--disable-gpu",
-                    "--no-sandbox",
-                    "--disable-dev-shm-usage",
-                    "--disable-blink-features=AutomationControlled",
-                    "--disable-infobars",
-                    "--window-position=0,0",
-                    "--ignore-certificate-errors",
-                    "--ignore-certificate-errors-spki-list",
-                    # "--headless=new",  # Use the new headless mode
-                ]
-            }
-            
-            # Add proxy settings if a proxy is specified
-            if self.proxy:
-                proxy_settings = ProxySettings(server=self.proxy)
-                browser_args["proxy"] = proxy_settings
-            elif self.proxy_config:
-                proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password"))
-                browser_args["proxy"] = proxy_settings
-                
-            # Select the appropriate browser based on the browser_type
-            if self.browser_type == "firefox":
-                self.browser = await self.playwright.firefox.launch(**browser_args)
-            elif self.browser_type == "webkit":
-                self.browser = await self.playwright.webkit.launch(**browser_args)
+            if self.use_managed_browser:
+                # Use managed browser approach
+                self.managed_browser = ManagedBrowser(
+                    browser_type=self.browser_type,
+                    user_data_dir=self.user_data_dir,
+                    headless=self.headless
+                )
+                cdp_url = await self.managed_browser.start()
+                self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
            else:
-                self.browser = await self.playwright.chromium.launch(**browser_args)
+                browser_args = {
+                    "headless": self.headless,
+                    "args": [
+                        "--disable-gpu",
+                        "--no-sandbox",
+                        "--disable-dev-shm-usage",
+                        "--disable-blink-features=AutomationControlled",
+                        "--disable-infobars",
+                        "--window-position=0,0",
+                        "--ignore-certificate-errors",
+                        "--ignore-certificate-errors-spki-list",
+                        # "--headless=new",  # Use the new headless mode
+                    ]
+                }
+                
+                # Add proxy settings if a proxy is specified
+                if self.proxy:
+                    proxy_settings = ProxySettings(server=self.proxy)
+                    browser_args["proxy"] = proxy_settings
+                elif self.proxy_config:
+                    proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password"))
+                    browser_args["proxy"] = proxy_settings
+                    
+                # Select the appropriate browser based on the browser_type
+                if self.browser_type == "firefox":
+                    self.browser = await self.playwright.firefox.launch(**browser_args)
+                elif self.browser_type == "webkit":
+                    self.browser = await self.playwright.webkit.launch(**browser_args)
+                else:
+                    self.browser = await self.playwright.chromium.launch(**browser_args)

            await self.execute_hook('on_browser_created', self.browser)

@@ -142,6 +257,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        if self.browser:
            await self.browser.close()
            self.browser = None
+        if self.managed_browser:
+            await self.managed_browser.cleanup()
+            self.managed_browser = None
        if self.playwright:
            await self.playwright.stop()
            self.playwright = None
@@ -399,7 +517,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                status_code = 200
                response_headers = {}

-            await page.wait_for_selector('body')
+            # Wait for the body element in two stages: attached to the DOM first, then visible per its computed CSS.
+            try:
+                # First wait for body to exist, regardless of visibility
+                await page.wait_for_selector('body', state='attached', timeout=30000)
+                
+                # Then wait for it to become visible by checking CSS
+                await page.wait_for_function("""
+                    () => {
+                        const body = document.body;
+                        const style = window.getComputedStyle(body);
+                        return style.display !== 'none' && 
+                            style.visibility !== 'hidden' && 
+                            style.opacity !== '0';
+                    }
+                """, timeout=30000)
+                
+            except Error as e:
+                # If waiting fails, let's try to diagnose the issue
+                visibility_info = await page.evaluate("""
+                    () => {
+                        const body = document.body;
+                        const style = window.getComputedStyle(body);
+                        return {
+                            display: style.display,
+                            visibility: style.visibility,
+                            opacity: style.opacity,
+                            hasContent: body.innerHTML.length,
+                            classList: Array.from(body.classList)
+                        }
+                    }
+                """)
+                
+                if self.verbose:
+                    print(f"Body visibility debug info: {visibility_info}")
+                
+                # Even if body is hidden, we might still want to proceed
+                if kwargs.get('ignore_body_visibility', True):
+                    if self.verbose:
+                        print("Proceeding despite hidden body...")
+                    pass
+                else:
+                    raise Error(f"Body element is hidden: {visibility_info}")
            
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -2,18 +2,82 @@ import os
 from pathlib import Path
 import aiosqlite
 import asyncio
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Dict
+from contextlib import asynccontextmanager
+import logging
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 DB_PATH = os.path.join(Path.home(), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")

 class AsyncDatabaseManager:
-    def __init__(self):
+    def __init__(self, pool_size: int = 10, max_retries: int = 3):
        self.db_path = DB_PATH
+        self.pool_size = pool_size
+        self.max_retries = max_retries
+        self.connection_pool: Dict[int, aiosqlite.Connection] = {}
+        self.pool_lock = asyncio.Lock()
+        self.connection_semaphore = asyncio.Semaphore(pool_size)
+        
+    async def initialize(self):
+        """Initialize the database schema via ainit_db(); connections are opened on demand"""
+        await self.ainit_db()
+        
+    async def cleanup(self):
+        """Close every pooled connection and empty the pool (call when shutting down)"""
+        async with self.pool_lock:
+            for conn in self.connection_pool.values():
+                await conn.close()
+            self.connection_pool.clear()
+
+    @asynccontextmanager
+    async def get_connection(self):
+        """Yield a connection for the current task, bounded by the semaphore, and close it on exit"""
+        async with self.connection_semaphore:
+            task_id = id(asyncio.current_task())
+            try:
+                async with self.pool_lock:
+                    if task_id not in self.connection_pool:
+                        conn = await aiosqlite.connect(
+                            self.db_path,
+                            timeout=30.0
+                        )
+                        await conn.execute('PRAGMA journal_mode = WAL')
+                        await conn.execute('PRAGMA busy_timeout = 5000')
+                        self.connection_pool[task_id] = conn
+                    
+                yield self.connection_pool[task_id]
+                
+            except Exception as e:
+                logger.error(f"Connection error: {e}")
+                raise
+            finally:
+                async with self.pool_lock:
+                    if task_id in self.connection_pool:
+                        await self.connection_pool[task_id].close()
+                        del self.connection_pool[task_id]
+
+    async def execute_with_retry(self, operation, *args):
+        """Execute database operations with retry logic"""
+        for attempt in range(self.max_retries):
+            try:
+                async with self.get_connection() as db:
+                    result = await operation(db, *args)
+                    await db.commit()
+                    return result
+            except Exception as e:
+                if attempt == self.max_retries - 1:
+                    logger.error(f"Operation failed after {self.max_retries} attempts: {e}")
+                    raise
+                await asyncio.sleep(1 * (attempt + 1))  # Linear backoff: wait 1s, 2s, ... between attempts

    async def ainit_db(self):
-        async with aiosqlite.connect(self.db_path) as db:
+        """Initialize database schema"""
+        async def _init(db):
            await db.execute('''
                CREATE TABLE IF NOT EXISTS crawled_data (
                    url TEXT PRIMARY KEY,
@@ -28,87 +92,101 @@ class AsyncDatabaseManager:
                    screenshot TEXT DEFAULT ""
                )
            ''')
-            await db.commit()
+        
+        await self.execute_with_retry(_init)
        await self.update_db_schema()

    async def update_db_schema(self):
-        async with aiosqlite.connect(self.db_path) as db:
-            # Check if the 'media' column exists
+        """Update database schema if needed"""
+        async def _check_columns(db):
            cursor = await db.execute("PRAGMA table_info(crawled_data)")
            columns = await cursor.fetchall()
-            column_names = [column[1] for column in columns]
-            
-            if 'media' not in column_names:
-                await self.aalter_db_add_column('media')
-            
-            # Check for other missing columns and add them if necessary
-            for column in ['links', 'metadata', 'screenshot']:
-                if column not in column_names:
-                    await self.aalter_db_add_column(column)
+            return [column[1] for column in columns]
+
+        column_names = await self.execute_with_retry(_check_columns)
+        
+        for column in ['media', 'links', 'metadata', 'screenshot']:
+            if column not in column_names:
+                await self.aalter_db_add_column(column)

    async def aalter_db_add_column(self, new_column: str):
-        try:
-            async with aiosqlite.connect(self.db_path) as db:
-                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
-                await db.commit()
-            print(f"Added column '{new_column}' to the database.")
-        except Exception as e:
-            print(f"Error altering database to add {new_column} column: {e}")
+        """Add new column to the database"""
+        async def _alter(db):
+            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+            logger.info(f"Added column '{new_column}' to the database.")
+
+        await self.execute_with_retry(_alter)

    async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
+        """Retrieve cached URL data"""
+        async def _get(db):
+            async with db.execute(
+                'SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?',
+                (url,)
+            ) as cursor:
+                return await cursor.fetchone()
+
        try:
-            async with aiosqlite.connect(self.db_path) as db:
-                async with db.execute('SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', (url,)) as cursor:
-                    return await cursor.fetchone()
+            return await self.execute_with_retry(_get)
        except Exception as e:
-            print(f"Error retrieving cached URL: {e}")
+            logger.error(f"Error retrieving cached URL: {e}")
            return None

    async def acache_url(self, url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""):
+        """Cache URL data with retry logic"""
+        async def _cache(db):
+            await db.execute('''
+                INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                ON CONFLICT(url) DO UPDATE SET
+                    html = excluded.html,
+                    cleaned_html = excluded.cleaned_html,
+                    markdown = excluded.markdown,
+                    extracted_content = excluded.extracted_content,
+                    success = excluded.success,
+                    media = excluded.media,      
+                    links = excluded.links,    
+                    metadata = excluded.metadata,      
+                    screenshot = excluded.screenshot
+            ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
+
        try:
-            async with aiosqlite.connect(self.db_path) as db:
-                await db.execute('''
-                    INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
-                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
-                    ON CONFLICT(url) DO UPDATE SET
-                        html = excluded.html,
-                        cleaned_html = excluded.cleaned_html,
-                        markdown = excluded.markdown,
-                        extracted_content = excluded.extracted_content,
-                        success = excluded.success,
-                        media = excluded.media,      
-                        links = excluded.links,    
-                        metadata = excluded.metadata,      
-                        screenshot = excluded.screenshot
-                ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
-                await db.commit()
+            await self.execute_with_retry(_cache)
        except Exception as e:
-            print(f"Error caching URL: {e}")
+            logger.error(f"Error caching URL: {e}")

    async def aget_total_count(self) -> int:
+        """Get total number of cached URLs"""
+        async def _count(db):
+            async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
+                result = await cursor.fetchone()
+                return result[0] if result else 0
+
        try:
-            async with aiosqlite.connect(self.db_path) as db:
-                async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
-                    result = await cursor.fetchone()
-                    return result[0] if result else 0
+            return await self.execute_with_retry(_count)
        except Exception as e:
-            print(f"Error getting total count: {e}")
+            logger.error(f"Error getting total count: {e}")
            return 0

    async def aclear_db(self):
+        """Clear all data from the database"""
+        async def _clear(db):
+            await db.execute('DELETE FROM crawled_data')
+
        try:
-            async with aiosqlite.connect(self.db_path) as db:
-                await db.execute('DELETE FROM crawled_data')
-                await db.commit()
+            await self.execute_with_retry(_clear)
        except Exception as e:
-            print(f"Error clearing database: {e}")
+            logger.error(f"Error clearing database: {e}")

    async def aflush_db(self):
-        try:
-            async with aiosqlite.connect(self.db_path) as db:
-                await db.execute('DROP TABLE IF EXISTS crawled_data')
-                await db.commit()
-        except Exception as e:
-            print(f"Error flushing database: {e}")
+        """Drop the entire table"""
+        async def _flush(db):
+            await db.execute('DROP TABLE IF EXISTS crawled_data')

+        try:
+            await self.execute_with_retry(_flush)
+        except Exception as e:
+            logger.error(f"Error flushing database: {e}")
+
+# Create a singleton instance
 async_db_manager = AsyncDatabaseManager()
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -48,7 +48,8 @@ class AsyncWebCrawler:
    async def awarmup(self):
        if self.verbose:
            print("[LOG] 🌤️  Warming up the AsyncWebCrawler")
-        await async_db_manager.ainit_db()
+        # await async_db_manager.ainit_db()
+        await async_db_manager.initialize()
        await self.arun(
            url="https://google.com/",
            word_count_threshold=5,
@@ -125,6 +126,7 @@ class AsyncWebCrawler:
                verbose,
                bool(cached),
                async_response=async_response,
+                bypass_cache=bypass_cache,
                **kwargs,
            )
            crawl_result.status_code = async_response.status_code if async_response else 200
@@ -243,7 +245,7 @@ class AsyncWebCrawler:

        screenshot = None if not screenshot else screenshot

-        if not is_cached:
+        if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
            await async_db_manager.acache_url(
                url,
                html,
@@ -274,7 +276,8 @@ class AsyncWebCrawler:
        )

    async def aclear_cache(self):
-        await async_db_manager.aclear_db()
+        await async_db_manager.aclear_db()  # delete every cached row; cleanup() alone removes no data
+        await async_db_manager.cleanup()  # then close any pooled connections

    async def aflush_cache(self):
        await async_db_manager.aflush_db()
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -14,12 +14,97 @@ from .utils import (
    sanitize_html,
    extract_metadata,
    InvalidCSSSelectorError,
-    CustomHTML2Text,
+    # CustomHTML2Text,
    normalize_url,
    is_external_url
    
 )

+from .html2text import HTML2Text
+class CustomHTML2Text(HTML2Text):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.inside_pre = False
+        self.inside_code = False
+        self.preserve_tags = set()  # Set of tags to preserve
+        self.current_preserved_tag = None
+        self.preserved_content = []
+        self.preserve_depth = 0
+        
+        # Configuration options
+        self.skip_internal_links = False
+        self.single_line_break = False
+        self.mark_code = False
+        self.include_sup_sub = False
+        self.body_width = 0
+        self.ignore_mailto_links = True
+        self.ignore_links = False
+        self.escape_backslash = False
+        self.escape_dot = False
+        self.escape_plus = False
+        self.escape_dash = False
+        self.escape_snob = False
+
+    def update_params(self, **kwargs):
+        """Update parameters and set preserved tags."""
+        for key, value in kwargs.items():
+            if key == 'preserve_tags':
+                self.preserve_tags = set(value)
+            else:
+                setattr(self, key, value)
+
+    def handle_tag(self, tag, attrs, start):
+        # Handle preserved tags
+        if tag in self.preserve_tags:
+            if start:
+                if self.preserve_depth == 0:
+                    self.current_preserved_tag = tag
+                    self.preserved_content = []
+                    # Format opening tag with attributes
+                    attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
+                    self.preserved_content.append(f'<{tag}{attr_str}>')
+                self.preserve_depth += 1
+                return
+            else:
+                self.preserve_depth -= 1
+                if self.preserve_depth == 0:
+                    self.preserved_content.append(f'</{tag}>')
+                    # Output the preserved HTML block with proper spacing
+                    preserved_html = ''.join(self.preserved_content)
+                    self.o('\n' + preserved_html + '\n')
+                    self.current_preserved_tag = None
+                return
+
+        # If we're inside a preserved tag, collect all content
+        if self.preserve_depth > 0:
+            if start:
+                # Format nested tags with attributes
+                attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None)
+                self.preserved_content.append(f'<{tag}{attr_str}>')
+            else:
+                self.preserved_content.append(f'</{tag}>')
+            return
+
+        # Handle pre tags
+        if tag == 'pre':
+            if start:
+                self.o('```\n')
+                self.inside_pre = True
+            else:
+                self.o('\n```')
+                self.inside_pre = False
+        elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+            pass
+        else:
+            super().handle_tag(tag, attrs, start)
+
+    def handle_data(self, data, entity_char=False):
+        """Override handle_data to capture content within preserved tags."""
+        if self.preserve_depth > 0:
+            self.preserved_content.append(data)
+            return
+        super().handle_data(data, entity_char)
+
 class ContentScrappingStrategy(ABC):
    @abstractmethod
    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -178,7 +178,7 @@ def escape_json_string(s):
    
    return s

-class CustomHTML2Text(HTML2Text):
+class CustomHTML2Text_v0(HTML2Text):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.inside_pre = False
@@ -981,6 +981,19 @@ def format_html(html_string):
    return soup.prettify()

 def normalize_url(href, base_url):
+    """Normalize URLs to ensure consistent format"""
+    from urllib.parse import urljoin, urlparse
+
+    # Parse base URL to get components
+    parsed_base = urlparse(base_url)
+    if not parsed_base.scheme or not parsed_base.netloc:
+        raise ValueError(f"Invalid base URL format: {base_url}")
+
+    # Use urljoin to handle all cases
+    normalized = urljoin(base_url, href.strip())
+    return normalized
+
+def normalize_url_tmp(href, base_url):
    """Normalize URLs to ensure consistent format"""
    # Extract protocol and domain from base URL
    try: