diff --git a/CHANGELOG.md b/CHANGELOG.md index b79b37ed..6ab493ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,39 @@ # Changelog +## [v0.3.73] - 2024-10-22 + +### Added +- New `ContentCleaningStrategy` class: + - Smart content extraction based on text density and element scoring + - Automatic removal of boilerplate content + - DOM tree analysis for better content identification + - Configurable thresholds for content detection +- Advanced proxy support: + - Added `proxy_config` option for authenticated proxy connections + - Support for username/password in proxy configuration +- New content output formats: + - `fit_markdown`: Optimized markdown output with main content focus + - `fit_html`: Clean HTML with only essential content + +### Enhanced +- Image source detection: + - Support for multiple image source attributes (`src`, `data-src`, `srcset`, etc.) + - Automatic fallback through potential source attributes + - Smart handling of srcset attribute +- External content handling: + - Made external link exclusion optional (disabled by default) + - Improved detection and handling of social media links + - Better control over external image filtering + +### Fixed +- Image extraction reliability with multiple source attribute checks +- External link and image handling logic for better accuracy + +### Developer Notes +- The new `ContentCleaningStrategy` uses configurable thresholds for customization +- Proxy configuration now supports more complex authentication scenarios +- Content extraction process now provides both regular and optimized outputs + ## [v0.3.72] - 2024-10-20 ### Fixed diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index ca6e4c5b..1ddb32da 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -71,6 +71,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" ) self.proxy = kwargs.get("proxy") + 
self.proxy_config = kwargs.get("proxy_config") self.headless = kwargs.get("headless", True) self.browser_type = kwargs.get("browser_type", "chromium") self.headers = kwargs.get("headers", {}) @@ -121,6 +122,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.proxy: proxy_settings = ProxySettings(server=self.proxy) browser_args["proxy"] = proxy_settings + elif self.proxy_config: + proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password")) + browser_args["proxy"] = proxy_settings # Select the appropriate browser based on the browser_type if self.browser_type == "firefox": diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 2a12c775..005523eb 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -212,6 +212,8 @@ class AsyncWebCrawler: cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) markdown = sanitize_input_encode(result.get("markdown", "")) + fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) + fit_html = sanitize_input_encode(result.get("fit_html", "")) media = result.get("media", []) links = result.get("links", []) metadata = result.get("metadata", {}) @@ -258,6 +260,8 @@ class AsyncWebCrawler: html=html, cleaned_html=format_html(cleaned_html), markdown=markdown, + fit_markdown=fit_markdown, + fit_html= fit_html, media=media, links=links, metadata=metadata, diff --git a/crawl4ai/content_cleaning_strategy.py b/crawl4ai/content_cleaning_strategy.py new file mode 100644 index 00000000..2f052f76 --- /dev/null +++ b/crawl4ai/content_cleaning_strategy.py @@ -0,0 +1,196 @@ +from bs4 import BeautifulSoup, Tag +import re +from typing import Optional + +class ContentCleaningStrategy: + def __init__(self): + # Precompile regex patterns for performance + self.negative_patterns = re.compile(r'nav|footer|header|sidebar|ads|comment', re.I) + self.positive_patterns = 
re.compile(r'content|article|main|post', re.I) + self.priority_tags = {'article', 'main', 'section', 'div'} + self.non_content_tags = {'nav', 'footer', 'header', 'aside'} + # Thresholds + self.text_density_threshold = 9.0 + self.min_word_count = 50 + self.link_density_threshold = 0.2 + self.max_dom_depth = 10 # To prevent excessive DOM traversal + + def clean(self, clean_html: str) -> str: + """ + Main function that takes cleaned HTML and returns super cleaned HTML. + + Args: + clean_html (str): The cleaned HTML content. + + Returns: + str: The super cleaned HTML containing only the main content. + """ + try: + if not clean_html or not isinstance(clean_html, str): + return '' + soup = BeautifulSoup(clean_html, 'html.parser') + main_content = self.extract_main_content(soup) + if main_content: + super_clean_element = self.clean_element(main_content) + return str(super_clean_element) + else: + return '' + except Exception: + # Handle exceptions silently or log them as needed + return '' + + def extract_main_content(self, soup: BeautifulSoup) -> Optional[Tag]: + """ + Identifies and extracts the main content element from the HTML. + + Args: + soup (BeautifulSoup): The parsed HTML soup. + + Returns: + Optional[Tag]: The Tag object containing the main content, or None if not found. + """ + candidates = [] + for element in soup.find_all(self.priority_tags): + if self.is_non_content_tag(element): + continue + if self.has_negative_class_id(element): + continue + score = self.calculate_content_score(element) + candidates.append((score, element)) + + if not candidates: + return None + + # Sort candidates by score in descending order + candidates.sort(key=lambda x: x[0], reverse=True) + # Select the element with the highest score + best_element = candidates[0][1] + return best_element + + def calculate_content_score(self, element: Tag) -> float: + """ + Calculates a score for an element based on various heuristics. + + Args: + element (Tag): The HTML element to score. 
+ + Returns: + float: The content score of the element. + """ + score = 0.0 + + if self.is_priority_tag(element): + score += 5.0 + if self.has_positive_class_id(element): + score += 3.0 + if self.has_negative_class_id(element): + score -= 3.0 + if self.is_high_text_density(element): + score += 2.0 + if self.is_low_link_density(element): + score += 2.0 + if self.has_sufficient_content(element): + score += 2.0 + if self.has_headings(element): + score += 3.0 + + dom_depth = self.calculate_dom_depth(element) + score += min(dom_depth, self.max_dom_depth) * 0.5 # Adjust weight as needed + + return score + + def is_priority_tag(self, element: Tag) -> bool: + """Checks if the element is a priority tag.""" + return element.name in self.priority_tags + + def is_non_content_tag(self, element: Tag) -> bool: + """Checks if the element is a non-content tag.""" + return element.name in self.non_content_tags + + def has_negative_class_id(self, element: Tag) -> bool: + """Checks if the element has negative indicators in its class or id.""" + class_id = ' '.join(filter(None, [ + self.get_attr_str(element.get('class')), + element.get('id', '') + ])) + return bool(self.negative_patterns.search(class_id)) + + def has_positive_class_id(self, element: Tag) -> bool: + """Checks if the element has positive indicators in its class or id.""" + class_id = ' '.join(filter(None, [ + self.get_attr_str(element.get('class')), + element.get('id', '') + ])) + return bool(self.positive_patterns.search(class_id)) + + @staticmethod + def get_attr_str(attr) -> str: + """Converts an attribute value to a string.""" + if isinstance(attr, list): + return ' '.join(attr) + elif isinstance(attr, str): + return attr + else: + return '' + + def is_high_text_density(self, element: Tag) -> bool: + """Determines if the element has high text density.""" + text_density = self.calculate_text_density(element) + return text_density > self.text_density_threshold + + def calculate_text_density(self, element: Tag) -> 
float: + """Calculates the text density of an element.""" + text_length = len(element.get_text(strip=True)) + tag_count = len(element.find_all()) + tag_count = tag_count or 1 # Prevent division by zero + return text_length / tag_count + + def is_low_link_density(self, element: Tag) -> bool: + """Determines if the element has low link density.""" + link_density = self.calculate_link_density(element) + return link_density < self.link_density_threshold + + def calculate_link_density(self, element: Tag) -> float: + """Calculates the link density of an element.""" + text = element.get_text(strip=True) + if not text: + return 0.0 + link_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a')) + return len(link_text) / len(text) if text else 0.0 + + def has_sufficient_content(self, element: Tag) -> bool: + """Checks if the element has sufficient word count.""" + word_count = len(element.get_text(strip=True).split()) + return word_count >= self.min_word_count + + def calculate_dom_depth(self, element: Tag) -> int: + """Calculates the depth of an element in the DOM tree.""" + depth = 0 + current_element = element + while current_element.parent and depth < self.max_dom_depth: + depth += 1 + current_element = current_element.parent + return depth + + def has_headings(self, element: Tag) -> bool: + """Checks if the element contains heading tags.""" + return bool(element.find(['h1', 'h2', 'h3'])) + + def clean_element(self, element: Tag) -> Tag: + """ + Cleans the selected element by removing unnecessary attributes and nested non-content elements. + + Args: + element (Tag): The HTML element to clean. + + Returns: + Tag: The cleaned HTML element. 
+ """ + for tag in element.find_all(['script', 'style', 'aside']): + tag.decompose() + for tag in element.find_all(): + attrs = dict(tag.attrs) + for attr in attrs: + if attr in ['style', 'onclick', 'onmouseover', 'align', 'bgcolor']: + del tag.attrs[attr] + return element diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index 139779ea..0c472f0d 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -7,6 +7,7 @@ from .config import * from bs4 import element, NavigableString, Comment from urllib.parse import urljoin from requests.exceptions import InvalidSchema +from .content_cleaning_strategy import ContentCleaningStrategy from .utils import ( sanitize_input_encode, @@ -215,7 +216,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): links['internal'].append(link_data) keep_element = True - if kwargs.get('exclude_external_links', True): + if kwargs.get('exclude_external_links', False): href_parts = href.split('/') href_url_base = href_parts[2] if len(href_parts) > 2 else href if url_base not in href_url_base: @@ -231,9 +232,20 @@ class WebScrappingStrategy(ContentScrappingStrategy): try: if element.name == 'img': + potential_sources = ['src', 'data-src', 'srcset', 'data-lazy-src', 'data-original'] + src = element.get('src', '') + while not src and potential_sources: + src = element.get(potential_sources.pop(0), '') + if not src: + element.decompose() + return False + + # If it is srcset pick up the first image + if 'srcset' in element.attrs: + src = element.attrs['srcset'].split(',')[0].split(' ')[0] + # Check flag if we should remove external images if kwargs.get('exclude_external_images', False): - src = element.get('src', '') src_url_base = src.split('/')[2] url_base = url.split('/')[2] if url_base not in src_url_base: @@ -241,7 +253,6 @@ class WebScrappingStrategy(ContentScrappingStrategy): return False if not kwargs.get('exclude_external_images', False) and 
kwargs.get('exclude_social_media_links', True): - src = element.get('src', '') src_url_base = src.split('/')[2] url_base = url.split('/')[2] if any(domain in src for domain in social_media_domains): @@ -386,10 +397,16 @@ class WebScrappingStrategy(ContentScrappingStrategy): except Exception as e: print('Error extracting metadata:', str(e)) meta = {} + + cleaner = ContentCleaningStrategy() + fit_html = cleaner.clean(cleaned_html) + fit_markdown = h.handle(fit_html) cleaned_html = sanitize_html(cleaned_html) return { 'markdown': markdown, + 'fit_markdown': fit_markdown, + 'fit_html': fit_html, 'cleaned_html': cleaned_html, 'success': success, 'media': media, diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 151ccb4f..4ac06797 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -14,6 +14,8 @@ class CrawlResult(BaseModel): links: Dict[str, List[Dict]] = {} screenshot: Optional[str] = None markdown: Optional[str] = None + fit_markdown: Optional[str] = None + fit_html: Optional[str] = None extracted_content: Optional[str] = None metadata: Optional[dict] = None error_message: Optional[str] = None