perf(crawler): major performance improvements & raw HTML support

- Switch to lxml parser (~4x speedup)
- Add raw HTML & local file crawling support
- Fix cache headers & async cleanup
- Add browser process monitoring
- Optimize BeautifulSoup operations
- Pre-compile regex patterns

Breaking: Raw HTML handling requires new URL prefixes ('raw:' for inline HTML, 'file://' for local files)

Fixes: #256, #253
2024-11-13 19:40:40 +08:00
parent 61b93ebf36
commit c38ac29edb
11 changed files with 2953 additions and 130 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -84,7 +84,7 @@ class ManagedBrowser:
                print(f"STDOUT: {stdout.decode()}")
                print(f"STDERR: {stderr.decode()}")
                await self.cleanup()
-    
+
    def _get_browser_path(self) -> str:
        """Returns the browser executable path based on OS and browser type"""
        if sys.platform == "darwin":  # macOS
@@ -493,6 +493,75 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        return page  
    
    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
+        """
+        Crawls a given URL or processes raw HTML/local file content based on the URL prefix.
+
+        Args:
+            url (str): The URL to crawl. Supported prefixes:
+                - 'http://' or 'https://': Web URL to crawl.
+                - 'file://': Local file path to process.
+                - 'raw:': Raw HTML content to process.
+            **kwargs: Additional parameters:
+                - 'screenshot' (bool): Whether to take a screenshot.
+                - ... [other existing parameters]
+
+        Returns:
+            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
+        """
+        response_headers = {}
+        status_code = 200  # Default to 200 for local/raw HTML
+        screenshot_requested = kwargs.get('screenshot', False)
+        screenshot_data = None
+
+        if url.startswith(('http://', 'https://')):
+            # Proceed with standard web crawling
+            return await self._crawl_web(url, **kwargs)
+
+        elif url.startswith('file://'):
+            # Process local file
+            local_file_path = url[7:]  # Remove 'file://' prefix
+            if not os.path.exists(local_file_path):
+                raise FileNotFoundError(f"Local file not found: {local_file_path}")
+            with open(local_file_path, 'r', encoding='utf-8') as f:
+                html = f.read()
+            if screenshot_requested:
+                screenshot_data = await self._generate_screenshot_from_html(html)
+            return AsyncCrawlResponse(
+                html=html,
+                response_headers=response_headers,
+                status_code=status_code,
+                screenshot=screenshot_data,
+                get_delayed_content=None
+            )
+
+        elif url.startswith('raw:'):
+            # Process raw HTML content
+            raw_html = url[4:]  # Remove 'raw:' prefix
+            html = raw_html
+            if screenshot_requested:
+                screenshot_data = await self._generate_screenshot_from_html(html)
+            return AsyncCrawlResponse(
+                html=html,
+                response_headers=response_headers,
+                status_code=status_code,
+                screenshot=screenshot_data,
+                get_delayed_content=None
+            )
+        else:
+            raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'")
+
+
+    async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse:
+        """
+        Existing web crawling logic remains unchanged.
+
+        Args:
+            url (str): The web URL to crawl.
+            **kwargs: Additional parameters.
+
+        Returns:
+            AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot.
+        """
        response_headers = {}
        status_code = None
        
@@ -792,7 +861,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

            if self.verbose:
                print(f"[LOG] ✅ Crawled {url} successfully!")
-
+           
            if self.use_cached_html:
                cache_file_path = os.path.join(
                    Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
@@ -972,6 +1041,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                print(f"Warning: Failed to remove overlay elements: {str(e)}")

    async def take_screenshot(self, page: Page) -> str:
+        """
+        Takes a screenshot of the current page.
+        
+        Args:
+            page (Page): The Playwright page instance
+            
+        Returns:
+            str: Base64-encoded screenshot image
+        """
        try:
            # The page is already loaded, just take the screenshot
            screenshot = await page.screenshot(full_page=True)
@@ -991,4 +1069,36 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        finally:
            await page.close()
+            
+    async def _generate_screenshot_from_html(self, html: str) -> Optional[str]:
+        """
+        Generates a screenshot from raw HTML content.
+
+        Args:
+            html (str): The HTML content to render and capture.
+
+        Returns:
+            Optional[str]: Base64-encoded screenshot image or an error image if failed.
+        """
+        try:
+            if not self.browser:
+                await self.start()
+            page = await self.browser.new_page()
+            await page.set_content(html, wait_until='networkidle')
+            screenshot = await page.screenshot(full_page=True)
+            await page.close()
+            return base64.b64encode(screenshot).decode('utf-8')
+        except Exception as e:
+            error_message = f"Failed to take screenshot: {str(e)}"
+            print(error_message)
+
+            # Generate an error image
+            img = Image.new('RGB', (800, 600), color='black')
+            draw = ImageDraw.Draw(img)
+            font = ImageFont.load_default()
+            draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
+
+            buffered = BytesIO()
+            img.save(buffered, format="JPEG")
+            return base64.b64encode(buffered.getvalue()).decode('utf-8')

--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -5,6 +5,7 @@ import asyncio
 from typing import Optional, Tuple, Dict
 from contextlib import asynccontextmanager
 import logging
+import json  # Added for serialization/deserialization

 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -89,7 +90,8 @@ class AsyncDatabaseManager:
                    media TEXT DEFAULT "{}",
                    links TEXT DEFAULT "{}",
                    metadata TEXT DEFAULT "{}",
-                    screenshot TEXT DEFAULT ""
+                    screenshot TEXT DEFAULT "",
+                    response_headers TEXT DEFAULT "{}"  -- New column added
                )
            ''')
        
@@ -105,26 +107,51 @@ class AsyncDatabaseManager:

        column_names = await self.execute_with_retry(_check_columns)
        
-        for column in ['media', 'links', 'metadata', 'screenshot']:
+        # List of new columns to add
+        new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers']
+        
+        for column in new_columns:
            if column not in column_names:
                await self.aalter_db_add_column(column)

    async def aalter_db_add_column(self, new_column: str):
        """Add new column to the database"""
        async def _alter(db):
-            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+            if new_column == 'response_headers':
+                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
+            else:
+                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
            logger.info(f"Added column '{new_column}' to the database.")

        await self.execute_with_retry(_alter)

-    async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
+    async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]:
        """Retrieve cached URL data"""
        async def _get(db):
            async with db.execute(
-                'SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?',
+                '''
+                SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers
+                FROM crawled_data WHERE url = ?
+                ''',
                (url,)
            ) as cursor:
-                return await cursor.fetchone()
+                row = await cursor.fetchone()
+                if row:
+                    # Deserialize JSON fields
+                    return (
+                        row[0],  # url
+                        row[1],  # html
+                        row[2],  # cleaned_html
+                        row[3],  # markdown
+                        row[4],  # extracted_content
+                        row[5],  # success
+                        json.loads(row[6] or '{}'),  # media
+                        json.loads(row[7] or '{}'),  # links
+                        json.loads(row[8] or '{}'),  # metadata
+                        row[9],  # screenshot
+                        json.loads(row[10] or '{}')  # response_headers
+                    )
+                return None

        try:
            return await self.execute_with_retry(_get)
@@ -132,12 +159,27 @@ class AsyncDatabaseManager:
            logger.error(f"Error retrieving cached URL: {e}")
            return None

-    async def acache_url(self, url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""):
+    async def acache_url(
+        self,
+        url: str,
+        html: str,
+        cleaned_html: str,
+        markdown: str,
+        extracted_content: str,
+        success: bool,
+        media: str = "{}",
+        links: str = "{}",
+        metadata: str = "{}",
+        screenshot: str = "",
+        response_headers: str = "{}"  # New parameter added
+    ):
        """Cache URL data with retry logic"""
        async def _cache(db):
            await db.execute('''
-                INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                INSERT INTO crawled_data (
+                    url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers
+                )
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(url) DO UPDATE SET
                    html = excluded.html,
                    cleaned_html = excluded.cleaned_html,
@@ -147,8 +189,9 @@ class AsyncDatabaseManager:
                    media = excluded.media,      
                    links = excluded.links,    
                    metadata = excluded.metadata,      
-                    screenshot = excluded.screenshot
-            ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot))
+                    screenshot = excluded.screenshot,
+                    response_headers = excluded.response_headers  -- Update response_headers
+            ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot, response_headers))

        try:
            await self.execute_with_retry(_cache)
@@ -189,4 +232,4 @@ class AsyncDatabaseManager:
            logger.error(f"Error flushing database: {e}")

 # Create a singleton instance
-async_db_manager = AsyncDatabaseManager()
+async_db_manager = AsyncDatabaseManager()
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -9,7 +9,7 @@ from .async_database import async_db_manager
 from .chunking_strategy import *
 from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
-from .content_scrapping_strategy import WebScrappingStrategy
+from .content_scrapping_strategy import WebScrapingStrategy
 from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
 from .utils import (
    sanitize_input_encode,
@@ -47,17 +47,17 @@ class AsyncWebCrawler:

    async def awarmup(self):
        # Print a message for crawl4ai and its version
+        print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
        if self.verbose:
-            print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
            print("[LOG] 🌤️  Warming up the AsyncWebCrawler")
        # await async_db_manager.ainit_db()
        await async_db_manager.initialize()
-        # await self.arun(
-        #     url="https://google.com/",
-        #     word_count_threshold=5,
-        #     bypass_cache=False,
-        #     verbose=False,
-        # )
+        await self.arun(
+            url="https://google.com/",
+            word_count_threshold=5,
+            bypass_cache=False,
+            verbose=False,
+        )
        self.ready = True
        if self.verbose:
            print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
@@ -75,6 +75,19 @@ class AsyncWebCrawler:
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
+        """
+        Runs the crawler for a single source: URL (web, local file, or raw HTML).
+
+        Args:
+            url (str): The URL to crawl. Supported prefixes:
+                - 'http://' or 'https://': Web URL to crawl.
+                - 'file://': Local file path to process.
+                - 'raw:': Raw HTML content to process.
+            ... [other existing parameters]
+
+        Returns:
+            CrawlResult: The result of the crawling and processing.
+        """
        try:
            extraction_strategy = extraction_strategy or NoExtractionStrategy()
            extraction_strategy.verbose = verbose
@@ -89,8 +102,13 @@ class AsyncWebCrawler:
            cached = None
            screenshot_data = None
            extracted_content = None
-            if not bypass_cache and not self.always_by_pass_cache:
+            
+            is_web_url = url.startswith(('http://', 'https://'))
+            if is_web_url and not bypass_cache and not self.always_by_pass_cache:
                cached = await async_db_manager.aget_cached_url(url)
+                        
+            # if not bypass_cache and not self.always_by_pass_cache:
+            #     cached = await async_db_manager.aget_cached_url(url)

            if kwargs.get("warmup", True) and not self.ready:
                return None
@@ -117,25 +135,32 @@ class AsyncWebCrawler:
                    )

            crawl_result = await self.aprocess_html(
-                url,
-                html,
-                extracted_content,
-                word_count_threshold,
-                extraction_strategy,
-                chunking_strategy,
-                css_selector,
-                screenshot_data,
-                verbose,
-                bool(cached),
+                url=url,
+                html=html,
+                extracted_content=extracted_content,
+                word_count_threshold=word_count_threshold,
+                extraction_strategy=extraction_strategy,
+                chunking_strategy=chunking_strategy,
+                css_selector=css_selector,
+                screenshot=screenshot_data,
+                verbose=verbose,
+                is_cached=bool(cached),
                async_response=async_response,
                bypass_cache=bypass_cache,
                **kwargs,
            )
-            crawl_result.status_code = async_response.status_code if async_response else 200
-            crawl_result.response_headers = async_response.response_headers if async_response else {}
+            
+            if async_response:
+                crawl_result.status_code = async_response.status_code
+                crawl_result.response_headers = async_response.response_headers
+            else:
+                crawl_result.status_code = 200
+                crawl_result.response_headers = cached[10]
+
            crawl_result.success = bool(html)
            crawl_result.session_id = kwargs.get("session_id", None)
            return crawl_result
+        
        except Exception as e:
            if not hasattr(e, "msg"):
                e.msg = str(e)
@@ -155,22 +180,40 @@ class AsyncWebCrawler:
        verbose=True,
        **kwargs,
    ) -> List[CrawlResult]:
-        tasks = [
-            self.arun(
-                url,
-                word_count_threshold,
-                extraction_strategy,
-                chunking_strategy,
-                bypass_cache,
-                css_selector,
-                screenshot,
-                user_agent,
-                verbose,
-                **kwargs
-            )
-            for url in urls
-        ]
-        return await asyncio.gather(*tasks)
+        """
+        Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
+
+        Args:
+            urls (List[str]): A list of URLs with supported prefixes:
+                - 'http://' or 'https://': Web URL to crawl.
+                - 'file://': Local file path to process.
+                - 'raw:': Raw HTML content to process.
+            ... [other existing parameters]
+
+        Returns:
+            List[CrawlResult]: The results of the crawling and processing.
+        """
+        semaphore_count = kwargs.get('semaphore_count', 5)  # Adjust as needed
+        semaphore = asyncio.Semaphore(semaphore_count)
+
+        async def crawl_with_semaphore(url):
+            async with semaphore:
+                return await self.arun(
+                    url,
+                    word_count_threshold=word_count_threshold,
+                    extraction_strategy=extraction_strategy,
+                    chunking_strategy=chunking_strategy,
+                    bypass_cache=bypass_cache,
+                    css_selector=css_selector,
+                    screenshot=screenshot,
+                    user_agent=user_agent,
+                    verbose=verbose,
+                    **kwargs,
+                )
+
+        tasks = [crawl_with_semaphore(url) for url in urls]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        return [result if not isinstance(result, Exception) else str(result) for result in results]

    async def aprocess_html(
        self,
@@ -184,13 +227,14 @@ class AsyncWebCrawler:
        screenshot: str,
        verbose: bool,
        is_cached: bool,
+        async_response: Optional[AsyncCrawlResponse],
        **kwargs,
    ) -> CrawlResult:
        t = time.time()
        # Extract content from HTML
        try:
            t1 = time.time()
-            scrapping_strategy = WebScrappingStrategy()
+            scrapping_strategy = WebScrapingStrategy()
            # result = await scrapping_strategy.ascrap(
            result = scrapping_strategy.scrap(
                url,
@@ -245,6 +289,12 @@ class AsyncWebCrawler:
            )

        screenshot = None if not screenshot else screenshot
+        
+        response_headers = "{}"  # Default value
+        if async_response:
+            # Serialize response_headers dict to JSON string
+            response_headers = json.dumps(async_response.response_headers, ensure_ascii=False)
+

        if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
            await async_db_manager.acache_url(
@@ -258,6 +308,7 @@ class AsyncWebCrawler:
                json.dumps(links),
                json.dumps(metadata),
                screenshot=screenshot,
+                response_headers=response_headers,
            )

        return CrawlResult(
--- a/crawl4ai/content_cleaning_strategy.py
+++ b/crawl4ai/content_cleaning_strategy.py
@@ -15,7 +15,7 @@ class ContentCleaningStrategy:
        self.link_density_threshold = 0.2
        self.max_dom_depth = 10  # To prevent excessive DOM traversal

-    def clean(self, clean_html: str) -> str:
+    def clean(self, clean_html: str, soup = None) -> str:
        """
        Main function that takes cleaned HTML and returns super cleaned HTML.

@@ -28,18 +28,20 @@ class ContentCleaningStrategy:
        try:
            if not clean_html or not isinstance(clean_html, str):
                return ''
-            soup = BeautifulSoup(clean_html, 'html.parser')
+            if not soup:
+                # soup = BeautifulSoup(clean_html, 'html.parser')
+                soup = BeautifulSoup(clean_html, 'lxml')
            main_content = self.extract_main_content(soup)
            if main_content:
                super_clean_element = self.clean_element(main_content)
-                return str(super_clean_element)
+                return super_clean_element.encode_contents().decode('utf-8')
            else:
                return ''
        except Exception:
            # Handle exceptions silently or log them as needed
            return ''

-    def extract_main_content(self, soup: BeautifulSoup) -> Optional[Tag]:
+    def extract_main_content(self, soup) -> Optional[Tag]:
        """
        Identifies and extracts the main content element from the HTML.

--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -1,3 +1,4 @@
+import re  # Point 1: Pre-Compile Regular Expressions
 from abc import ABC, abstractmethod
 from typing import Dict, Any
 from bs4 import BeautifulSoup
@@ -105,7 +106,39 @@ class CustomHTML2Text(HTML2Text):
            return
        super().handle_data(data, entity_char)

-class ContentScrappingStrategy(ABC):
+# Pre-compile regular expressions for Open Graph and Twitter metadata
+OG_REGEX = re.compile(r'^og:')
+TWITTER_REGEX = re.compile(r'^twitter:')
+DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
+
+# Function to parse image height/width value and units
+def parse_dimension(dimension):
+    if dimension:
+        # match = re.match(r"(\d+)(\D*)", dimension)
+        match = DIMENSION_REGEX.match(dimension)
+        if match:
+            number = int(match.group(1))
+            unit = match.group(2) or 'px'  # Default unit is 'px' if not specified
+            return number, unit
+    return None, None
+
+# Fetch image file metadata to extract size and extension
+def fetch_image_file_size(img, base_url):
+    #If src is relative path construct full URL, if not it may be CDN URL
+    img_url = urljoin(base_url,img.get('src'))
+    try:
+        response = requests.head(img_url)
+        if response.status_code == 200:
+            return response.headers.get('Content-Length',None)
+        else:
+            print(f"Failed to retrieve file size for {img_url}")
+            return None
+    except InvalidSchema as e:
+        return None
+    finally:
+        return
+
+class ContentScrapingStrategy(ABC):
    @abstractmethod
    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        pass
@@ -114,7 +147,7 @@ class ContentScrappingStrategy(ABC):
    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        pass

-class WebScrappingStrategy(ContentScrappingStrategy):
+class WebScrapingStrategy(ContentScrapingStrategy):
    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs)

@@ -126,9 +159,16 @@ class WebScrappingStrategy(ContentScrappingStrategy):
        if not html:
            return None

-        soup = BeautifulSoup(html, 'html.parser')
+        # soup = BeautifulSoup(html, 'html.parser')
+        soup = BeautifulSoup(html, 'lxml')
        body = soup.body
        
+        try:
+            meta = extract_metadata("", soup)
+        except Exception as e:
+            print('Error extracting metadata:', str(e))
+            meta = {}
+        
        
        image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)

@@ -187,31 +227,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):

            #Score an image for it's usefulness
            def score_image_for_usefulness(img, base_url, index, images_count):
-                # Function to parse image height/width value and units
-                def parse_dimension(dimension):
-                    if dimension:
-                        match = re.match(r"(\d+)(\D*)", dimension)
-                        if match:
-                            number = int(match.group(1))
-                            unit = match.group(2) or 'px'  # Default unit is 'px' if not specified
-                            return number, unit
-                    return None, None

-                # Fetch image file metadata to extract size and extension
-                def fetch_image_file_size(img, base_url):
-                    #If src is relative path construct full URL, if not it may be CDN URL
-                    img_url = urljoin(base_url,img.get('src'))
-                    try:
-                        response = requests.head(img_url)
-                        if response.status_code == 200:
-                            return response.headers.get('Content-Length',None)
-                        else:
-                            print(f"Failed to retrieve file size for {img_url}")
-                            return None
-                    except InvalidSchema as e:
-                        return None
-                    finally:
-                        return

                image_height = img.get('height')
                height_value, height_unit = parse_dimension(image_height)
@@ -294,7 +310,6 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                
                exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
                exclude_social_media_domains = list(set(exclude_social_media_domains))
-
                
                try:
                    if element.name == 'a' and element.get('href'):
@@ -439,15 +454,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
            except Exception as e:
                print('Error processing element:', str(e))
                return False
-
-        #process images by filtering and extracting contextual text from the page
-        # imgs = body.find_all('img')
-        # media['images'] = [
-        #     result for result in
-        #     (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
-        #     if result is not None
-        # ]
-        
+       
        process_element(body)
        
        # Update the links dictionary with unique links
@@ -478,8 +485,9 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                # Replace base64 data with empty string
                img['src'] = base64_pattern.sub('', src)
                
+        str_body = ""
        try:
-            str(body)
+            str_body = body.encode_contents().decode('utf-8')
        except Exception as e:
            # Reset body to the original HTML
            success = False
@@ -504,11 +512,12 @@ class WebScrappingStrategy(ContentScrappingStrategy):
            
            # Append the error div to the body
            body.body.append(error_div)
+            str_body = body.encode_contents().decode('utf-8')
            
            print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")


-        cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')
+        cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')

        try:
            h = CustomHTML2Text()
@@ -518,15 +527,14 @@ class WebScrappingStrategy(ContentScrappingStrategy):
            markdown = h.handle(sanitize_html(cleaned_html))
        markdown = markdown.replace('    ```', '```')

-        try:
-            meta = extract_metadata(html, soup)
-        except Exception as e:
-            print('Error extracting metadata:', str(e))
-            meta = {}
+        
            
-        cleaner = ContentCleaningStrategy()
-        fit_html = cleaner.clean(cleaned_html)
-        fit_markdown = h.handle(fit_html)
+        fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
+        fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
+        if kwargs.get('fit_markdown', False):
+            cleaner = ContentCleaningStrategy()
+            fit_html = cleaner.clean(cleaned_html)
+            fit_markdown = h.handle(fit_html)

        cleaned_html = sanitize_html(cleaned_html)
        return {
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -736,46 +736,54 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
        'metadata': meta
    }

-def extract_metadata(html, soup = None):
+def extract_metadata(html, soup=None):
    metadata = {}
    
-    if not html:
+    if not html and not soup:
+        return {}
+    
+    if not soup:
+        soup = BeautifulSoup(html, 'lxml')
+    
+    head = soup.head
+    if not head:
        return metadata
    
-    # Parse HTML content with BeautifulSoup
-    if not soup:
-        soup = BeautifulSoup(html, 'html.parser')
-
    # Title
-    title_tag = soup.find('title')
-    metadata['title'] = title_tag.string if title_tag else None
+    title_tag = head.find('title')
+    metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None

    # Meta description
-    description_tag = soup.find('meta', attrs={'name': 'description'})
-    metadata['description'] = description_tag['content'] if description_tag else None
+    description_tag = head.find('meta', attrs={'name': 'description'})
+    metadata['description'] = description_tag.get('content', '').strip() if description_tag else None

    # Meta keywords
-    keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
-    metadata['keywords'] = keywords_tag['content'] if keywords_tag else None
+    keywords_tag = head.find('meta', attrs={'name': 'keywords'})
+    metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None

    # Meta author
-    author_tag = soup.find('meta', attrs={'name': 'author'})
-    metadata['author'] = author_tag['content'] if author_tag else None
+    author_tag = head.find('meta', attrs={'name': 'author'})
+    metadata['author'] = author_tag.get('content', '').strip() if author_tag else None

    # Open Graph metadata
-    og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
+    og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')})
    for tag in og_tags:
-        property_name = tag['property']
-        metadata[property_name] = tag['content']
+        property_name = tag.get('property', '').strip()
+        content = tag.get('content', '').strip()
+        if property_name and content:
+            metadata[property_name] = content

    # Twitter Card metadata
-    twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
+    twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')})
    for tag in twitter_tags:
-        property_name = tag['name']
-        metadata[property_name] = tag['content']
-
+        property_name = tag.get('name', '').strip()
+        content = tag.get('content', '').strip()
+        if property_name and content:
+            metadata[property_name] = content
+    
    return metadata

+
 def extract_xml_tags(string):
    tags = re.findall(r'<(\w+)>', string)
    return list(set(tags))
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -10,7 +10,7 @@ from .extraction_strategy import *
 from .crawler_strategy import *
 from typing import List
 from concurrent.futures import ThreadPoolExecutor
-from .content_scrapping_strategy import WebScrappingStrategy
+from .content_scrapping_strategy import WebScrapingStrategy
 from .config import *
 import warnings
 import json
@@ -182,7 +182,7 @@ class WebCrawler:
            # Extract content from HTML
            try:
                t1 = time.time()
-                scrapping_strategy = WebScrappingStrategy()
+                scrapping_strategy = WebScrapingStrategy()
                extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]}
                result = scrapping_strategy.scrap(
                    url,