Enhance features and documentation

- Updated version to 0.3.743
- Improved ManagedBrowser configuration with dynamic host/port
- Implemented fast HTML formatting in web crawler
- Enhanced markdown generation with a new generator class
- Improved sanitization and utility functions
- Added contributor details and pull request acknowledgments
- Updated documentation for clearer usage scenarios
- Adjusted tests to reflect class name changes
2024-11-28 12:45:05 +08:00
parent 829a1f7992
commit 24723b2f10
9 changed files with 123 additions and 42 deletions
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.3.741"
+__version__ = "0.3.743"
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -35,13 +35,14 @@ stealth_config = StealthConfig(


 class ManagedBrowser:
-    def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None):
+    def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222):
        self.browser_type = browser_type
        self.user_data_dir = user_data_dir
        self.headless = headless
        self.browser_process = None
        self.temp_dir = None
-        self.debugging_port = 9222
+        self.debugging_port = debugging_port
+        self.host = host
        self.logger = logger
        self.shutting_down = False

@@ -70,7 +71,7 @@ class ManagedBrowser:
            # Monitor browser process output for errors
            asyncio.create_task(self._monitor_browser_process())
            await asyncio.sleep(2)  # Give browser time to start
-            return f"http://localhost:{self.debugging_port}"
+            return f"http://{self.host}:{self.debugging_port}"
        except Exception as e:
            await self.cleanup()
            raise Exception(f"Failed to start browser: {e}")
@@ -416,13 +417,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        else:
            raise ValueError(f"Invalid hook type: {hook_type}")

-    async def execute_hook(self, hook_type: str, *args):
+    async def execute_hook(self, hook_type: str, *args, **kwargs):  # keyword args (e.g. context=...) are forwarded to the hook
        hook = self.hooks.get(hook_type)
        if hook:
            if asyncio.iscoroutinefunction(hook):
-                return await hook(*args)
+                return await hook(*args, **kwargs)  # coroutine hooks are awaited
            else:
-                return hook(*args)
+                return hook(*args, **kwargs)  # plain callables are invoked directly
        return args[0] if args else None

    def update_user_agent(self, user_agent: str):
@@ -642,6 +643,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        session_id = kwargs.get("session_id")
        
        # Handle page creation differently for managed browser
+        context = None
        if self.use_managed_browser:
            if session_id:
                # Reuse existing session if available
@@ -760,7 +762,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                    return response

            if not kwargs.get("js_only", False):
-                await self.execute_hook('before_goto', page)
+                await self.execute_hook('before_goto', page, context = context)
                

                response = await page.goto(
@@ -773,7 +775,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                # response = await page.goto("about:blank")
                # await page.evaluate(f"window.location.href = '{url}'")
                
-                await self.execute_hook('after_goto', page)
+                await self.execute_hook('after_goto', page, context = context)
                
                # Get status code and headers
                status_code = response.status
@@ -838,7 +840,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                # await page.wait_for_timeout(100)
                
                # Check for on execution event
-                await self.execute_hook('on_execution_started', page)
+                await self.execute_hook('on_execution_started', page, context = context)
                
            if kwargs.get("simulate_user", False) or kwargs.get("magic", False):
                # Simulate user interactions
@@ -924,7 +926,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            if kwargs.get("process_iframes", False):
                page = await self.process_iframes(page)
            
-            await self.execute_hook('before_retrieve_html', page)
+            await self.execute_hook('before_retrieve_html', page, context = context)
            # Check if delay_before_return_html is set then wait for that time
            delay_before_return_html = kwargs.get("delay_before_return_html")
            if delay_before_return_html:
@@ -935,7 +937,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                await self.remove_overlay_elements(page)
            
            html = await page.content()
-            await self.execute_hook('before_return_html', page, html)
+            await self.execute_hook('before_return_html', page, html, context = context)
            
            # Check if kwargs has screenshot=True then take screenshot
            screenshot_data = None
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -25,7 +25,8 @@ from .config import (
 from .utils import (
    sanitize_input_encode,
    InvalidCSSSelectorError,
-    format_html
+    format_html,
+    fast_format_html
 )
 from urllib.parse import urlparse
 import random
@@ -534,16 +535,17 @@ class AsyncWebCrawler:
                    "timing": time.perf_counter() - t1
                }
            )
-        
-
-                

        screenshot = None if not screenshot else screenshot
        
+        
+        if kwargs.get("prettiify", False):
+            cleaned_html = fast_format_html(cleaned_html)
+        
        return CrawlResult(
            url=url,
            html=html,
-            cleaned_html=format_html(cleaned_html),
+            cleaned_html=cleaned_html,
            markdown_v2=markdown_v2,
            markdown=markdown,
            fit_markdown=fit_markdown,
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -10,7 +10,7 @@ from urllib.parse import urljoin
 from requests.exceptions import InvalidSchema
 # from .content_cleaning_strategy import ContentCleaningStrategy
 from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
-from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy
+from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
 from .models import MarkdownGenerationResult
 from .utils import (
    sanitize_input_encode,
@@ -105,21 +105,28 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        Returns:
            Dict containing markdown content in various formats
        """
-        markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy())
+        markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())
        
        if markdown_generator:
            try:
+                if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
+                        markdown_generator.content_filter = BM25ContentFilter(
+                            user_query=kwargs.get('fit_markdown_user_query', None),
+                            bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
+                        )
+                
                markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
                    cleaned_html=cleaned_html,
                    base_url=url,
-                    html2text_options=kwargs.get('html2text', {}),
-                    content_filter=kwargs.get('content_filter', None)
+                    html2text_options=kwargs.get('html2text', {})
                )
                
+                help_message = """"""
+                
                return {
                    'markdown': markdown_result.raw_markdown,  
-                    'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
-                    'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
+                    'fit_markdown': markdown_result.fit_markdown,
+                    'fit_html': markdown_result.fit_html, 
                    'markdown_v2': markdown_result
                }
            except Exception as e:
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -11,6 +11,8 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')

 class MarkdownGenerationStrategy(ABC):
    """Abstract base class for markdown generation strategies."""
+    def __init__(self, content_filter: Optional[RelevantContentFilter] = None):
+        self.content_filter = content_filter
    
    @abstractmethod
    def generate_markdown(self, 
@@ -23,8 +25,10 @@ class MarkdownGenerationStrategy(ABC):
        """Generate markdown from cleaned HTML."""
        pass

-class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy):
+class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """Default implementation of markdown generation strategy."""
+    def __init__(self, content_filter: Optional[RelevantContentFilter] = None):
+        super().__init__(content_filter)
    
    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        link_map = {}
@@ -84,14 +88,18 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy):
        raw_markdown = raw_markdown.replace('    ```', '```')

        # Convert links to citations
+        markdown_with_citations: str = ""
+        references_markdown: str = ""
        if citations:
            markdown_with_citations, references_markdown = self.convert_links_to_citations(
                raw_markdown, base_url
            )

        # Generate fit markdown if content filter is provided
-        fit_markdown: Optional[str] = None
-        if content_filter:
+        fit_markdown: Optional[str] = ""
+        filtered_html: Optional[str] = ""
+        if content_filter or self.content_filter:
+            content_filter = content_filter or self.content_filter
            filtered_html = content_filter.filter_content(cleaned_html)
            filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
            fit_markdown = h.handle(filtered_html)
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -233,12 +233,17 @@ def sanitize_html(html):
 def sanitize_input_encode(text: str) -> str:
    """Sanitize input to handle potential encoding issues."""
    try:
-        # Attempt to encode and decode as UTF-8 to handle potential encoding issues
-        return text.encode('utf-8', errors='ignore').decode('utf-8')
-    except UnicodeEncodeError as e:
-        print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
-        # Fall back to ASCII if UTF-8 fails
-        return text.encode('ascii', errors='ignore').decode('ascii')
+        try:
+            if not text:  # None or empty input short-circuits to ''
+                return ''
+            # Attempt to encode and decode as UTF-8 to handle potential encoding issues
+            return text.encode('utf-8', errors='ignore').decode('utf-8')
+        except UnicodeEncodeError as e:
+            print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}")
+            # Fall back to ASCII if UTF-8 fails
+            return text.encode('ascii', errors='ignore').decode('ascii')
+    except Exception as e:  # any other failure surfaces to callers as ValueError
+        raise ValueError(f"Error sanitizing input: {str(e)}") from e

 def escape_json_string(s):
    """
@@ -1079,9 +1084,54 @@ def wrap_text(draw, text, font, max_width):
    return '\n'.join(lines)

 def format_html(html_string):
-    soup = BeautifulSoup(html_string, 'html.parser')
+    soup = BeautifulSoup(html_string, 'lxml')  # valid bs4 parser names are 'lxml' / 'html.parser'; 'lxml.parser' is not one
    return soup.prettify()

+def fast_format_html(html_string):
+    """
+    Re-indent HTML using string operations instead of a parser.
+
+    Heuristic only: each tag or text run goes on its own line, indented two
+    spaces per open element. Closing tags dedent (never below zero); void
+    elements, self-closing tags and `<!...>`/`<?...>` markup do not indent.
+
+    Args:
+        html_string (str): The HTML string to format; falsy input gives ''
+
+    Returns:
+        str: The formatted HTML string
+    """
+    if not html_string:
+        return ''
+    indent = 0
+    indent_str = "  "  # Two spaces for indentation
+    formatted = []
+    # Elements that never take a closing tag, so they must not open a level
+    void_tags = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+                 'link', 'meta', 'param', 'source', 'track', 'wbr'}
+
+    # Split by < and > to separate tags and content
+    parts = html_string.replace('>', '>\n').replace('<', '\n<').split('\n')
+
+    for part in parts:
+        if not part.strip():
+            continue
+        if part.startswith('</'):
+            # Clamp so a stray closing tag cannot push the depth negative
+            indent = max(indent - 1, 0)
+            formatted.append(indent_str * indent + part)
+        elif part.startswith('<'):
+            formatted.append(indent_str * indent + part)
+            name = part[1:].rstrip('/>').split(None, 1)
+            standalone = part.startswith(('<!', '<?')) or part.endswith('/>')
+            if not standalone and not (name and name[0].lower() in void_tags):
+                indent += 1
+        else:
+            # Text between tags
+            formatted.append(indent_str * indent + part.strip())
+
+    return '\n'.join(formatted)
+
 def normalize_url(href, base_url):
    """Normalize URLs to ensure consistent format"""
    from urllib.parse import urljoin, urlparse