fix(metadata): improve title extraction with fallbacks for edge cases. REF #995

Some pages include a <title> tag in their HTML, but lxml fails to parse it because of the document's unusual structure. Added fallback logic using .find() and the OpenGraph/Twitter meta tags to ensure reliable title extraction.
Refactor content scraping strategies: comment out WebScrapingStrategy references and use LXMLWebScrapingStrategy instead across multiple files. Bring the WebScrapingStrategy methods over to LXMLWebScrapingStrategy.
2025-05-28 10:17:50 +02:00 · 2025-05-27 17:32:45 +02:00
8 changed files with 113 additions and 859 deletions
--- a/crawl4ai/init.py
+++ b/crawl4ai/init.py
@@ -6,7 +6,7 @@ from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, L

 from .content_scraping_strategy import (
    ContentScrapingStrategy,
-    WebScrapingStrategy,
+    # WebScrapingStrategy,
    LXMLWebScrapingStrategy,
 )
 from .async_logger import (
@@ -100,7 +100,7 @@ __all__ = [
    "CrawlerHub",
    "CacheMode",
    "ContentScrapingStrategy",
-    "WebScrapingStrategy",
+    # "WebScrapingStrategy",
    "LXMLWebScrapingStrategy",
    "BrowserConfig",
    "CrawlerRunConfig",
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -17,7 +17,7 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
 from .chunking_strategy import ChunkingStrategy, RegexChunking

 from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
-from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
+from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
 from .deep_crawling import DeepCrawlStrategy

 from .cache_context import CacheMode
@@ -725,7 +725,7 @@ class CrawlerRunConfig():
        parser_type (str): Type of parser to use for HTML parsing.
                           Default: "lxml".
        scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
-                           Default: WebScrapingStrategy.
+                           Default: LXMLWebScrapingStrategy.
        proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
                                     If None, no additional proxy config. Default: None.

@@ -979,7 +979,7 @@ class CrawlerRunConfig():
        self.remove_forms = remove_forms
        self.prettiify = prettiify
        self.parser_type = parser_type
-        self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
+        self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
        self.proxy_config = proxy_config
        self.proxy_rotation_strategy = proxy_rotation_strategy
        
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -2,7 +2,7 @@ import re
 from itertools import chain
 from abc import ABC, abstractmethod
 from typing import Dict, Any, Optional
-from bs4 import BeautifulSoup
+# from bs4 import BeautifulSoup
 import asyncio
 import requests
 from .config import (
@@ -13,12 +13,12 @@ from .config import (
    IMPORTANT_ATTRS,
    SOCIAL_MEDIA_DOMAINS,
 )
-from bs4 import NavigableString, Comment
-from bs4 import PageElement, Tag
+# from bs4 import NavigableString, Comment
+# from bs4 import PageElement, Tag
 from urllib.parse import urljoin
 from requests.exceptions import InvalidSchema
 from .utils import (
-    extract_metadata,
+    # extract_metadata,
    normalize_url,
    is_external_url,
    get_base_domain,
@@ -96,20 +96,16 @@ class ContentScrapingStrategy(ABC):
        pass


-class WebScrapingStrategy(ContentScrapingStrategy):
-    """
-    Class for web content scraping. Perhaps the most important class.
-
-    How it works:
-    1. Extract content from HTML using BeautifulSoup.
-    2. Clean the extracted content using a content cleaning strategy.
-    3. Filter the cleaned content using a content filtering strategy.
-    4. Generate markdown content from the filtered content.
-    5. Return the markdown content.
-    """
-
+class LXMLWebScrapingStrategy(ContentScrapingStrategy):
    def __init__(self, logger=None):
        self.logger = logger
+        self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
+        self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
+        
+        # Constants for image processing
+        self.classes_to_check = frozenset(["button", "icon", "logo"])
+        self.tags_to_check = frozenset(["button", "input"])
+        self.image_formats = frozenset(["jpg", "jpeg", "png", "webp", "avif", "gif"])

    def _log(self, level, message, tag="SCRAPE", **kwargs):
        """Helper method to safely use logger."""
@@ -130,7 +126,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            ScrapingResult: A structured result containing the scraped content.
        """
        actual_url = kwargs.get("redirected_url", url)
-        raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
+        raw_result = self._scrap(actual_url, html, **kwargs)
+        
        if raw_result is None:
            return ScrapingResult(
                cleaned_html="",
@@ -194,388 +191,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        Returns:
            ScrapingResult: A structured result containing the scraped content.
        """
-        return await asyncio.to_thread(self._scrap, url, html, **kwargs)
+        return await asyncio.to_thread(self.scrap, url, html, **kwargs)

-    def is_data_table(self, table: Tag, **kwargs) -> bool:
-        """
-        Determine if a table element is a data table (not a layout table).
-
-        Args:
-            table (Tag): BeautifulSoup Tag representing a table element
-            **kwargs: Additional keyword arguments including table_score_threshold
-
-        Returns:
-            bool: True if the table is a data table, False otherwise
-        """
-        score = 0
-        
-        # Check for thead and tbody
-        has_thead = len(table.select('thead')) > 0
-        has_tbody = len(table.select('tbody')) > 0
-        if has_thead:
-            score += 2
-        if has_tbody:
-            score += 1
-            
-        # Check for th elements
-        th_count = len(table.select('th'))
-        if th_count > 0:
-            score += 2
-            if has_thead or len(table.select('tr:first-child th')) > 0:
-                score += 1
-                
-        # Check for nested tables
-        if len(table.select('table')) > 0:
-            score -= 3
-            
-        # Role attribute check
-        role = table.get('role', '').lower()
-        if role in {'presentation', 'none'}:
-            score -= 3
-            
-        # Column consistency
-        rows = table.select('tr')
-        if not rows:
-            return False
-            
-        col_counts = [len(row.select('td, th')) for row in rows]
-        avg_cols = sum(col_counts) / len(col_counts)
-        variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
-        if variance < 1:
-            score += 2
-            
-        # Caption and summary
-        if table.select('caption'):
-            score += 2
-        if table.has_attr('summary') and table['summary']:
-            score += 1
-            
-        # Text density
-        total_text = sum(len(cell.get_text().strip()) for row in rows for cell in row.select('td, th'))
-        total_tags = sum(1 for _ in table.descendants if isinstance(_, Tag))
-        text_ratio = total_text / (total_tags + 1e-5)
-        if text_ratio > 20:
-            score += 3
-        elif text_ratio > 10:
-            score += 2
-            
-        # Data attributes
-        data_attrs = sum(1 for attr in table.attrs if attr.startswith('data-'))
-        score += data_attrs * 0.5
-        
-        # Size check
-        if avg_cols >= 2 and len(rows) >= 2:
-            score += 2
-            
-        threshold = kwargs.get('table_score_threshold', 7)
-        return score >= threshold
-    
-    def extract_table_data(self, table: Tag) -> dict:
-        """
-        Extract structured data from a table element.
-        
-        Args:
-            table (Tag): BeautifulSoup Tag representing a table element
-            
-        Returns:
-            dict: Dictionary containing table data (headers, rows, caption, summary)
-        """
-        caption_elem = table.select_one('caption')
-        caption = caption_elem.get_text().strip() if caption_elem else ""
-        summary = table.get('summary', '').strip()
-        
-        # Extract headers with colspan handling
-        headers = []
-        thead_rows = table.select('thead tr')
-        if thead_rows:
-            header_cells = thead_rows[0].select('th')
-            for cell in header_cells:
-                text = cell.get_text().strip()
-                colspan = int(cell.get('colspan', 1))
-                headers.extend([text] * colspan)
-        else:
-            first_row = table.select('tr:first-child')
-            if first_row:
-                for cell in first_row[0].select('th, td'):
-                    text = cell.get_text().strip()
-                    colspan = int(cell.get('colspan', 1))
-                    headers.extend([text] * colspan)
-        
-        # Extract rows with colspan handling
-        rows = []
-        all_rows = table.select('tr')
-        thead = table.select_one('thead')
-        tbody_rows = []
-
-        if thead:
-            thead_rows = thead.select('tr')
-            tbody_rows = [row for row in all_rows if row not in thead_rows]
-        else:
-            if all_rows and all_rows[0].select('th'):
-                tbody_rows = all_rows[1:]
-            else:
-                tbody_rows = all_rows
-                
-        for row in tbody_rows:        
-        # for row in table.select('tr:not(:has(ancestor::thead))'):
-            row_data = []
-            for cell in row.select('td'):
-                text = cell.get_text().strip()
-                colspan = int(cell.get('colspan', 1))
-                row_data.extend([text] * colspan)
-            if row_data:
-                rows.append(row_data)
-                
-        # Align rows with headers
-        max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
-        aligned_rows = []
-        for row in rows:
-            aligned = row[:max_columns] + [''] * (max_columns - len(row))
-            aligned_rows.append(aligned)
-            
-        if not headers:
-            headers = [f"Column {i+1}" for i in range(max_columns)]
-            
-        return {
-            "headers": headers,
-            "rows": aligned_rows,
-            "caption": caption,
-            "summary": summary,
-        }
-    
-    def flatten_nested_elements(self, node):
-        """
-        Flatten nested elements in a HTML tree.
-
-        Args:
-            node (Tag): The root node of the HTML tree.
-
-        Returns:
-            Tag: The flattened HTML tree.
-        """
-        if isinstance(node, NavigableString):
-            return node
-        if (
-            len(node.contents) == 1
-            and isinstance(node.contents[0], Tag)
-            and node.contents[0].name == node.name
-        ):
-            return self.flatten_nested_elements(node.contents[0])
-        node.contents = [self.flatten_nested_elements(child) for child in node.contents]
-        return node
-
-    def find_closest_parent_with_useful_text(self, tag, **kwargs):
-        """
-        Find the closest parent with useful text.
-
-        Args:
-            tag (Tag): The starting tag to search from.
-            **kwargs: Additional keyword arguments.
-
-        Returns:
-            Tag: The closest parent with useful text, or None if not found.
-        """
-        image_description_min_word_threshold = kwargs.get(
-            "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
-        )
-        current_tag = tag
-        while current_tag:
-            current_tag = current_tag.parent
-            # Get the text content of the parent tag
-            if current_tag:
-                text_content = current_tag.get_text(separator=" ", strip=True)
-                # Check if the text content has at least word_count_threshold
-                if len(text_content.split()) >= image_description_min_word_threshold:
-                    return text_content
-        return None
-
-    def remove_unwanted_attributes(
-        self, element, important_attrs, keep_data_attributes=False
-    ):
-        """
-        Remove unwanted attributes from an HTML element.
-
-        Args:
-            element (Tag): The HTML element to remove attributes from.
-            important_attrs (list): List of important attributes to keep.
-            keep_data_attributes (bool): Whether to keep data attributes.
-
-        Returns:
-            None
-        """
-        attrs_to_remove = []
-        for attr in element.attrs:
-            if attr not in important_attrs:
-                if keep_data_attributes:
-                    if not attr.startswith("data-"):
-                        attrs_to_remove.append(attr)
-                else:
-                    attrs_to_remove.append(attr)
-
-        for attr in attrs_to_remove:
-            del element[attr]
-
-    def process_image(self, img, url, index, total_images, **kwargs):
-        """
-        Process an image element.
-
-        How it works:
-        1. Check if the image has valid display and inside undesired html elements.
-        2. Score an image for it's usefulness.
-        3. Extract image file metadata to extract size and extension.
-        4. Generate a dictionary with the processed image information.
-        5. Return the processed image information.
-
-        Args:
-            img (Tag): The image element to process.
-            url (str): The URL of the page containing the image.
-            index (int): The index of the image in the list of images.
-            total_images (int): The total number of images in the list.
-            **kwargs: Additional keyword arguments.
-
-        Returns:
-            dict: A dictionary containing the processed image information.
-        """
-        # parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
-        #                 if ' ' in u else None}
-        #                 for u in [f"http{p}" for p in s.split("http") if p]]
-
-        # Constants for checks
-        classes_to_check = frozenset(["button", "icon", "logo"])
-        tags_to_check = frozenset(["button", "input"])
-        image_formats = frozenset(["jpg", "jpeg", "png", "webp", "avif", "gif"])
-
-        # Pre-fetch commonly used attributes
-        style = img.get("style", "")
-        alt = img.get("alt", "")
-        src = img.get("src", "")
-        data_src = img.get("data-src", "")
-        srcset = img.get("srcset", "")
-        data_srcset = img.get("data-srcset", "")
-        width = img.get("width")
-        height = img.get("height")
-        parent = img.parent
-        parent_classes = parent.get("class", [])
-
-        # Quick validation checks
-        if (
-            "display:none" in style
-            or parent.name in tags_to_check
-            or any(c in cls for c in parent_classes for cls in classes_to_check)
-            or any(c in src for c in classes_to_check)
-            or any(c in alt for c in classes_to_check)
-        ):
-            return None
-
-        # Quick score calculation
-        score = 0
-        if width and width.isdigit():
-            width_val = int(width)
-            score += 1 if width_val > 150 else 0
-        if height and height.isdigit():
-            height_val = int(height)
-            score += 1 if height_val > 150 else 0
-        if alt:
-            score += 1
-        score += index / total_images < 0.5
-
-        # image_format = ''
-        # if "data:image/" in src:
-        #     image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
-        # else:
-        #     image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
-
-        # if image_format in ('jpg', 'png', 'webp', 'avif'):
-        #     score += 1
-
-        # Check for image format in all possible sources
-        def has_image_format(url):
-            return any(fmt in url.lower() for fmt in image_formats)
-
-        # Score for having proper image sources
-        if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
-            score += 1
-        if srcset or data_srcset:
-            score += 1
-        if img.find_parent("picture"):
-            score += 1
-
-        # Detect format from any available source
-        detected_format = None
-        for url in [src, data_src, srcset, data_srcset]:
-            if url:
-                format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
-                if format_matches:
-                    detected_format = format_matches[0]
-                    break
-
-        if score <= kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD):
-            return None
-
-        # Use set for deduplication
-        unique_urls = set()
-        image_variants = []
-
-        # Generate a unique group ID for this set of variants
-        group_id = index
-
-        # Base image info template
-        base_info = {
-            "alt": alt,
-            "desc": self.find_closest_parent_with_useful_text(img, **kwargs),
-            "score": score,
-            "type": "image",
-            "group_id": group_id,  # Group ID for this set of variants
-            "format": detected_format,
-        }
-
-        # Inline function for adding variants
-        def add_variant(src, width=None):
-            if src and not src.startswith("data:") and src not in unique_urls:
-                unique_urls.add(src)
-                image_variants.append({**base_info, "src": src, "width": width})
-
-        # Process all sources
-        add_variant(src)
-        add_variant(data_src)
-
-        # Handle srcset and data-srcset in one pass
-        for attr in ("srcset", "data-srcset"):
-            if value := img.get(attr):
-                for source in parse_srcset(value):
-                    add_variant(source["url"], source["width"])
-
-        # Quick picture element check
-        if picture := img.find_parent("picture"):
-            for source in picture.find_all("source"):
-                if srcset := source.get("srcset"):
-                    for src in parse_srcset(srcset):
-                        add_variant(src["url"], src["width"])
-
-        # Framework-specific attributes in one pass
-        for attr, value in img.attrs.items():
-            if (
-                attr.startswith("data-")
-                and ("src" in attr or "srcset" in attr)
-                and "http" in value
-            ):
-                add_variant(value)
-
-        return image_variants if image_variants else None
-
-    def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
+    def process_element(self, url: str, element: lhtml.HtmlElement, **kwargs) -> Dict[str, Any]:
        """
        Process an HTML element.

-        How it works:
-        1. Check if the element is an image, video, or audio.
-        2. Extract the element's attributes and content.
-        3. Process the element based on its type.
-        4. Return the processed element information.
-
        Args:
            url (str): The URL of the page containing the element.
-            element (Tag): The HTML element to process.
+            element (lhtml.HtmlElement): The HTML element to process.
            **kwargs: Additional keyword arguments.

        Returns:
@@ -584,451 +208,40 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        media = {"images": [], "videos": [], "audios": [], "tables": []}
        internal_links_dict = {}
        external_links_dict = {}
+        
        self._process_element(
            url, element, media, internal_links_dict, external_links_dict, **kwargs
        )
+        
        return {
            "media": media,
            "internal_links_dict": internal_links_dict,
            "external_links_dict": external_links_dict,
        }

-    def _process_element(
-        self,
-        url,
-        element: PageElement,
-        media: Dict[str, Any],
-        internal_links_dict: Dict[str, Any],
-        external_links_dict: Dict[str, Any],
-        **kwargs,
-    ) -> bool:
+    def remove_unwanted_attributes(self, element: lhtml.HtmlElement, important_attrs: List[str], keep_data_attributes: bool = False):
        """
-        Process an HTML element.
-        """
-        try:
-            if isinstance(element, NavigableString):
-                if isinstance(element, Comment):
-                    element.extract()
-                return False
-
-            # if element.name == 'img':
-            #     process_image(element, url, 0, 1)
-            #     return True
-            base_domain = kwargs.get("base_domain", get_base_domain(url))
-
-            if element.name in ["script", "style", "link", "meta", "noscript"]:
-                element.decompose()
-                return False
-
-            keep_element = False
-            # Special case for table elements - always preserve structure
-            if element.name in ["tr", "td", "th"]:
-                keep_element = True
-
-            exclude_domains = kwargs.get("exclude_domains", [])
-            # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
-            # exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
-            # exclude_social_media_domains = list(set(exclude_social_media_domains))
-
-            try:
-                if element.name == "a" and element.get("href"):
-                    href = element.get("href", "").strip()
-                    if not href:  # Skip empty hrefs
-                        return False
-
-                    # url_base = url.split("/")[2]
-
-                    # Normalize the URL
-                    try:
-                        normalized_href = normalize_url(href, url)
-                    except ValueError:
-                        # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
-                        return False
-
-                    link_data = {
-                        "href": normalized_href,
-                        "text": element.get_text().strip(),
-                        "title": element.get("title", "").strip(),
-                        "base_domain": base_domain,
-                    }
-
-                    is_external = is_external_url(normalized_href, base_domain)
-
-                    keep_element = True
-
-                    # Handle external link exclusions
-                    if is_external:
-                        link_base_domain = get_base_domain(normalized_href)
-                        link_data["base_domain"] = link_base_domain
-                        if kwargs.get("exclude_external_links", False):
-                            element.decompose()
-                            return False
-                        # elif kwargs.get('exclude_social_media_links', False):
-                        #     if link_base_domain in exclude_social_media_domains:
-                        #         element.decompose()
-                        #         return False
-                        # if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
-                        #     element.decompose()
-                        #     return False
-                        elif exclude_domains:
-                            if link_base_domain in exclude_domains:
-                                element.decompose()
-                                return False
-                            # if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
-                            #     element.decompose()
-                            #     return False
-
-                    if is_external:
-                        if normalized_href not in external_links_dict:
-                            external_links_dict[normalized_href] = link_data
-                    else:
-                        if kwargs.get("exclude_internal_links", False):
-                            element.decompose()
-                            return False
-                        if normalized_href not in internal_links_dict:
-                            internal_links_dict[normalized_href] = link_data
-
-            except Exception as e:
-                raise Exception(f"Error processing links: {str(e)}")
-
-            try:
-                if element.name == "img":
-                    potential_sources = [
-                        "src",
-                        "data-src",
-                        "srcset" "data-lazy-src",
-                        "data-original",
-                    ]
-                    src = element.get("src", "")
-                    while not src and potential_sources:
-                        src = element.get(potential_sources.pop(0), "")
-                    if not src:
-                        element.decompose()
-                        return False
-
-                    # If it is srcset pick up the first image
-                    if "srcset" in element.attrs:
-                        src = element.attrs["srcset"].split(",")[0].split(" ")[0]
-
-                    # If image src is internal, then skip
-                    if not is_external_url(src, base_domain):
-                        return True
-
-                    image_src_base_domain = get_base_domain(src)
-
-                    # Check flag if we should remove external images
-                    if kwargs.get("exclude_external_images", False):
-                        element.decompose()
-                        return False
-                        # src_url_base = src.split('/')[2]
-                        # url_base = url.split('/')[2]
-                        # if url_base not in src_url_base:
-                        #     element.decompose()
-                        #     return False
-
-                    # if kwargs.get('exclude_social_media_links', False):
-                    #     if image_src_base_domain in exclude_social_media_domains:
-                    #         element.decompose()
-                    #         return False
-                    # src_url_base = src.split('/')[2]
-                    # url_base = url.split('/')[2]
-                    # if any(domain in src for domain in exclude_social_media_domains):
-                    #     element.decompose()
-                    #     return False
-
-                    # Handle exclude domains
-                    if exclude_domains:
-                        if image_src_base_domain in exclude_domains:
-                            element.decompose()
-                            return False
-                        # if any(domain in src for domain in kwargs.get('exclude_domains', [])):
-                        #     element.decompose()
-                        #     return False
-
-                    return True  # Always keep image elements
-            except Exception:
-                raise "Error processing images"
-
-            # Check if flag to remove all forms is set
-            if kwargs.get("remove_forms", False) and element.name == "form":
-                element.decompose()
-                return False
-
-            if element.name in ["video", "audio"]:
-                media[f"{element.name}s"].append(
-                    {
-                        "src": element.get("src"),
-                        "alt": element.get("alt"),
-                        "type": element.name,
-                        "description": self.find_closest_parent_with_useful_text(
-                            element, **kwargs
-                        ),
-                    }
-                )
-                source_tags = element.find_all("source")
-                for source_tag in source_tags:
-                    media[f"{element.name}s"].append(
-                        {
-                            "src": source_tag.get("src"),
-                            "alt": element.get("alt"),
-                            "type": element.name,
-                            "description": self.find_closest_parent_with_useful_text(
-                                element, **kwargs
-                            ),
-                        }
-                    )
-                return True  # Always keep video and audio elements
-
-            if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
-                if kwargs.get("only_text", False):
-                    element.replace_with(element.get_text())
-
-            try:
-                self.remove_unwanted_attributes(
-                    element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
-                )
-            except Exception as e:
-                # print('Error removing unwanted attributes:', str(e))
-                self._log(
-                    "error",
-                    message="Error removing unwanted attributes: {error}",
-                    tag="SCRAPE",
-                    params={"error": str(e)},
-                )
-            # Process children
-            for child in list(element.children):
-                if isinstance(child, NavigableString) and not isinstance(
-                    child, Comment
-                ):
-                    if len(child.strip()) > 0:
-                        keep_element = True
-                else:
-                    if self._process_element(
-                        url,
-                        child,
-                        media,
-                        internal_links_dict,
-                        external_links_dict,
-                        **kwargs,
-                    ):
-                        keep_element = True
-
-            # Check word count
-            word_count_threshold = kwargs.get(
-                "word_count_threshold", MIN_WORD_THRESHOLD
-            )
-            if not keep_element:
-                word_count = len(element.get_text(strip=True).split())
-                keep_element = word_count >= word_count_threshold
-
-            if not keep_element:
-                element.decompose()
-
-            return keep_element
-        except Exception as e:
-            # print('Error processing element:', str(e))
-            self._log(
-                "error",
-                message="Error processing element: {error}",
-                tag="SCRAPE",
-                params={"error": str(e)},
-            )
-            return False
-
-    def _scrap(
-        self,
-        url: str,
-        html: str,
-        word_count_threshold: int = MIN_WORD_THRESHOLD,
-        css_selector: str = None,
-        target_elements: List[str] = None,
-        **kwargs,
-    ) -> Dict[str, Any]:
-        """
-        Extract content from HTML using BeautifulSoup.
+        Remove unwanted attributes from an HTML element.

        Args:
-            url (str): The URL of the page to scrape.
-            html (str): The HTML content of the page to scrape.
-            word_count_threshold (int): The minimum word count threshold for content extraction.
-            css_selector (str): The CSS selector to use for content extraction.
-            **kwargs: Additional keyword arguments.
+            element (lhtml.HtmlElement): The HTML element to remove attributes from.
+            important_attrs (List[str]): List of important attributes to keep.
+            keep_data_attributes (bool): Whether to keep data attributes.

        Returns:
-            dict: A dictionary containing the extracted content.
+            None
        """
-        success = True
-        if not html:
-            return None
+        attrs_to_remove = []
+        for attr in element.attrib:
+            if attr not in important_attrs:
+                if keep_data_attributes:
+                    if not attr.startswith("data-"):
+                        attrs_to_remove.append(attr)
+                else:
+                    attrs_to_remove.append(attr)

-        parser_type = kwargs.get("parser", "lxml")
-        soup = BeautifulSoup(html, parser_type)
-        body = soup.body
-        if body is None:
-            raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
-        base_domain = get_base_domain(url)
-        
-        # Early removal of all images if exclude_all_images is set
-        # This happens before any processing to minimize memory usage
-        if kwargs.get("exclude_all_images", False):
-            for img in body.find_all('img'):
-                img.decompose()
-
-        try:
-            meta = extract_metadata("", soup)
-        except Exception as e:
-            self._log(
-                "error",
-                message="Error extracting metadata: {error}",
-                tag="SCRAPE",
-                params={"error": str(e)},
-            )
-            meta = {}
-
-        # Handle tag-based removal first - faster than CSS selection
-        excluded_tags = set(kwargs.get("excluded_tags", []) or [])
-        if excluded_tags:
-            for element in body.find_all(lambda tag: tag.name in excluded_tags):
-                element.extract()
-
-        # Handle CSS selector-based removal
-        excluded_selector = kwargs.get("excluded_selector", "")
-        if excluded_selector:
-            is_single_selector = (
-                "," not in excluded_selector and " " not in excluded_selector
-            )
-            if is_single_selector:
-                while element := body.select_one(excluded_selector):
-                    element.extract()
-            else:
-                for element in body.select(excluded_selector):
-                    element.extract()
-
-        content_element = None
-        if target_elements:
-            try:
-                for_content_targeted_element = []
-                for target_element in target_elements:
-                    for_content_targeted_element.extend(body.select(target_element))
-                content_element = soup.new_tag("div")
-                for el in for_content_targeted_element:
-                    content_element.append(copy.deepcopy(el))
-            except Exception as e:
-                self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
-                return None
-        else:
-            content_element = body     
-
-        kwargs["exclude_social_media_domains"] = set(
-            kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
-        )
-        kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
-        if kwargs.get("exclude_social_media_links", False):
-            kwargs["exclude_domains"] = kwargs["exclude_domains"].union(
-                kwargs["exclude_social_media_domains"]
-            )
-
-        result_obj = self.process_element(
-            url,
-            body,
-            word_count_threshold=word_count_threshold,
-            base_domain=base_domain,
-            **kwargs,
-        )
-
-        links = {"internal": [], "external": []}
-        media = result_obj["media"]
-        internal_links_dict = result_obj["internal_links_dict"]
-        external_links_dict = result_obj["external_links_dict"]
-
-        # Update the links dictionary with unique links
-        links["internal"] = list(internal_links_dict.values())
-        links["external"] = list(external_links_dict.values())
-
-        # # Process images using ThreadPoolExecutor
-        imgs = body.find_all("img")
-
-        media["images"] = [
-            img
-            for result in (
-                self.process_image(img, url, i, len(imgs), **kwargs)
-                for i, img in enumerate(imgs)
-            )
-            if result is not None
-            for img in result
-        ]
-        
-        # Process tables if not excluded
-        excluded_tags = set(kwargs.get("excluded_tags", []) or [])
-        if 'table' not in excluded_tags:
-            tables = body.find_all('table')
-            for table in tables:
-                if self.is_data_table(table, **kwargs):
-                    table_data = self.extract_table_data(table)
-                    media["tables"].append(table_data)
-
-        body = self.flatten_nested_elements(body)
-        base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
-        for img in imgs:
-            src = img.get("src", "")
-            if base64_pattern.match(src):
-                # Replace base64 data with empty string
-                img["src"] = base64_pattern.sub("", src)
-
-        str_body = ""
-        try:
-            str_body = content_element.encode_contents().decode("utf-8")
-        except Exception:
-            # Reset body to the original HTML
-            success = False
-            body = BeautifulSoup(html, "html.parser")
-
-            # Create a new div with a special ID
-            error_div = body.new_tag("div", id="crawl4ai_error_message")
-            error_div.string = """
-            Crawl4AI Error: This page is not fully supported.
-            
-            Possible reasons:
-            1. The page may have restrictions that prevent crawling.
-            2. The page might not be fully loaded.
-            
-            Suggestions:
-            - Try calling the crawl function with these parameters:
-            magic=True,
-            - Set headless=False to visualize what's happening on the page.
-            
-            If the issue persists, please check the page's structure and any potential anti-crawling measures.
-            """
-
-            # Append the error div to the body
-            body.append(error_div)
-            str_body = body.encode_contents().decode("utf-8")
-
-            print(
-                "[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details."
-            )
-            self._log(
-                "error",
-                message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
-                tag="SCRAPE",
-            )
-
-        cleaned_html = str_body.replace("\n\n", "\n").replace("  ", " ")
-
-        return {
-            "cleaned_html": cleaned_html,
-            "success": success,
-            "media": media,
-            "links": links,
-            "metadata": meta,
-        }
-
-
-class LXMLWebScrapingStrategy(WebScrapingStrategy):
-    def __init__(self, logger=None):
-        super().__init__(logger)
-        self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
-        self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
+        for attr in attrs_to_remove:
+            del element.attrib[attr]

    def _process_element(
        self,
@@ -1190,7 +403,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            return None

        parent = img.getparent()
-        if parent.tag in ["button", "input"]:
+        if parent.tag in self.tags_to_check:
            return None

        parent_classes = parent.get("class", "").split()
@@ -1200,8 +413,8 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            return None

        # If src is in class or alt, likely an icon
-        if (src and any(c in src for c in ["button", "icon", "logo"])) or (
-            alt and any(c in alt for c in ["button", "icon", "logo"])
+        if (src and any(c in src for c in self.classes_to_check)) or (
+            alt and any(c in alt for c in self.classes_to_check)
        ):
            return None

@@ -1216,11 +429,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
        score += index / total_images < 0.5

        # Check formats in all possible sources
-        image_formats = {"jpg", "jpeg", "png", "webp", "avif", "gif"}
        detected_format = None
        for url in [src, data_src, srcset, data_srcset]:
            if url:
-                format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
+                format_matches = [fmt for fmt in self.image_formats if fmt in url.lower()]
                if format_matches:
                    detected_format = format_matches[0]
                    score += 1
@@ -1484,6 +696,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):

        success = True
        try:
+            # Extract metadata FIRST from the original HTML to avoid issues with modified content.
+            try:
+                meta = extract_metadata_using_lxml(html, None)  # Pass the original HTML
+            except Exception as e:
+                self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
+                meta = {}
+                
            doc = lhtml.document_fromstring(html)
            # Match BeautifulSoup's behavior of using body or full doc
            # body = doc.xpath('//body')[0] if doc.xpath('//body') else doc
@@ -1524,14 +743,14 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                        "error", f"Error with excluded CSS selector: {str(e)}", "SCRAPE"
                    )

-            # Extract metadata before any content filtering
-            try:
-                meta = extract_metadata_using_lxml(
-                    "", doc
-                )  # Using same function as BeautifulSoup version
-            except Exception as e:
-                self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
-                meta = {}
+            # # Extract metadata before any content filtering
+            # try:
+            #     meta = extract_metadata_using_lxml(
+            #         "", doc
+            #     )  # Using same function as BeautifulSoup version
+            # except Exception as e:
+            #     self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
+            #     meta = {}

            content_element = None
            if target_elements:
@@ -1611,7 +830,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):

            # Remove unneeded attributes
            self.remove_unwanted_attributes_fast(
-                body, keep_data_attributes=kwargs.get("keep_data_attributes", False)
+                body, 
+                important_attrs=IMPORTANT_ATTRS + kwargs.get("keep_attrs", []),
+                keep_data_attributes=kwargs.get("keep_data_attributes", False)
            )

            # Generate output HTML
--- a/crawl4ai/types.py
+++ b/crawl4ai/types.py
@@ -19,7 +19,7 @@ LLMConfig = Union['LLMConfigType']

 # Content scraping types
 ContentScrapingStrategy = Union['ContentScrapingStrategyType']
-WebScrapingStrategy = Union['WebScrapingStrategyType']
+# WebScrapingStrategy = Union['WebScrapingStrategyType']
 LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']

 # Proxy types
@@ -106,7 +106,7 @@ if TYPE_CHECKING:
    # Content scraping imports
    from .content_scraping_strategy import (
        ContentScrapingStrategy as ContentScrapingStrategyType,
-        WebScrapingStrategy as WebScrapingStrategyType,
+        # WebScrapingStrategy as WebScrapingStrategyType,
        LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
    )
    
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1487,8 +1487,29 @@ def extract_metadata_using_lxml(html, doc=None):
    head = head[0]

    # Title - using XPath
+    # title = head.xpath(".//title/text()")
+    # metadata["title"] = title[0].strip() if title else None
+
+    # === Title Extraction - New Approach ===
+    # Attempt to extract <title> using XPath
    title = head.xpath(".//title/text()")
-    metadata["title"] = title[0].strip() if title else None
+    title = title[0] if title else None
+
+    # Fallback: Use .find() in case XPath fails due to malformed HTML
+    if not title:
+        title_el = doc.find(".//title")
+        title = title_el.text if title_el is not None else None
+
+    # Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty
+    if not title:
+        title_candidates = (
+            doc.xpath("//meta[@property='og:title']/@content") or
+            doc.xpath("//meta[@name='twitter:title']/@content")
+        )
+        title = title_candidates[0] if title_candidates else None
+
+    # Strip and assign title
+    metadata["title"] = title.strip() if title else None

    # Meta description - using XPath with multiple attribute conditions
    description = head.xpath('.//meta[@name="description"]/@content')
--- a/docs/examples/scraping_strategies_performance.py
+++ b/docs/examples/scraping_strategies_performance.py
@@ -1,6 +1,12 @@
-import time, re
-from crawl4ai.content_scraping_strategy import WebScrapingStrategy,  LXMLWebScrapingStrategy
 import time
+import os
+import sys
+
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
 import functools
 from collections import defaultdict

@@ -57,7 +63,7 @@ methods_to_profile = [


 # Apply decorators to both strategies
-for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
+for strategy, name in [(LXMLWebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
    for method in methods_to_profile:
        apply_decorators(strategy, method, name)

@@ -85,7 +91,7 @@ def generate_large_html(n_elements=1000):

 def test_scraping():
    # Initialize both scrapers
-    original_scraper = WebScrapingStrategy()
+    original_scraper = LXMLWebScrapingStrategy()
    selected_scraper = LXMLWebScrapingStrategy()
    
    # Generate test HTML
--- a/tests/async/test_content_scraper_strategy.py
+++ b/tests/async/test_content_scraper_strategy.py
@@ -12,10 +12,10 @@ parent_dir = os.path.dirname(
 sys.path.append(parent_dir)
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

-from crawl4ai.content_scraping_strategy import WebScrapingStrategy
-from crawl4ai.content_scraping_strategy import (
-    WebScrapingStrategy as WebScrapingStrategyCurrent,
-)
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+# from crawl4ai.content_scraping_strategy import (
+#     WebScrapingStrategy as WebScrapingStrategyCurrent,
+# )
 # from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent


@@ -32,8 +32,8 @@ class TestResult:

 class StrategyTester:
    def __init__(self):
-        self.new_scraper = WebScrapingStrategy()
-        self.current_scraper = WebScrapingStrategyCurrent()
+        self.new_scraper = LXMLWebScrapingStrategy()
+        self.current_scraper = LXMLWebScrapingStrategy()
        with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
            self.WIKI_HTML = f.read()
        self.results = {"new": [], "current": []}
--- a/tests/async/test_evaluation_scraping_methods_performance.configs.py
+++ b/tests/async/test_evaluation_scraping_methods_performance.configs.py
@@ -2,7 +2,6 @@ import json
 import time
 from bs4 import BeautifulSoup
 from crawl4ai.content_scraping_strategy import (
-    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
 )
 from typing import Dict, List, Tuple
@@ -274,7 +273,7 @@ def get_test_scenarios():
    that will be passed into scrap() for testing various features.
    """
    TEST_SCENARIOS = {
-        # "default": {},
+        "default": {},
        # "exclude_domains": {
        #     "exclude_domains": {"images.example.com", "ads.example.com"}
        # },
@@ -609,19 +608,26 @@ class ScraperEquivalenceTester:
        print("\n=== Testing complicated HTML with multiple parameter scenarios ===")

        # Create the scrapers once (or you can re-create if needed)
-        original = WebScrapingStrategy()
+        # original = WebScrapingStrategy()
+        original = LXMLWebScrapingStrategy()
        lxml = LXMLWebScrapingStrategy()
+        
+        # Base URL for testing
+        url = "http://test.com"
+        url = "https://kidocode.com"

        for scenario_name, params in get_test_scenarios().items():
            print(f"\nScenario: {scenario_name}")

            start = time.time()
-            orig_result = original.scrap("http://test.com", complicated_html, **params)
+            orig_result = original.scrap(url, complicated_html, **params)
            orig_time = time.time() - start
+            orig_result = orig_result.model_dump()

            start = time.time()
-            lxml_result = lxml.scrap("http://test.com", complicated_html, **params)
+            lxml_result = lxml.scrap(url, complicated_html, **params)
            lxml_time = time.time() - start
+            lxml_result = lxml_result.model_dump()

            diffs = {}
            link_diff = self.deep_compare_links(
Author	SHA1	Message	Date
ntohidi	2b3b728dcd	fix(metadata): improve title extraction with fallbacks for edge cases. REF #995 Some pages include a <title> tag in HTML but lxml fails to parse it due to unusual structure. Added fallback logic using .find() and OpenGraph/Twitter meta tags to ensure reliable title extraction.	2025-05-28 10:17:50 +02:00
ntohidi	bfec5156ad	Refactor content scraping strategies: comment out WebScrapingStrategy references and update to use LXMLWebScrapingStrategy across multiple files. Bring WebScrapingStrategy methods to LXMLWebScrapingStrategy	2025-05-27 17:32:45 +02:00