From 24723b2f100ed25747b1b84a833f82e17340b457 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 12:45:05 +0800 Subject: [PATCH] Enhance features and documentation - Updated version to 0.3.743 - Improved ManagedBrowser configuration with dynamic host/port - Implemented fast HTML formatting in web crawler - Enhanced markdown generation with a new generator class - Improved sanitization and utility functions - Added contributor details and pull request acknowledgments - Updated documentation for clearer usage scenarios - Adjusted tests to reflect class name changes --- CONTRIBUTORS.md | 8 +++ crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 24 +++++---- crawl4ai/async_webcrawler.py | 12 +++-- crawl4ai/content_scraping_strategy.py | 19 ++++--- crawl4ai/markdown_generation_strategy.py | 14 ++++-- crawl4ai/utils.py | 64 +++++++++++++++++++++--- docs/md_v2/advanced/hooks-auth.md | 8 ++- tests/async/test_markdown_genertor.py | 14 +++--- 9 files changed, 123 insertions(+), 42 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0b5dcede..deb46a9c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,11 +10,19 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Community Contributors +- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fix for `CustomHTML2Text` is not defined. 
- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies - [jonymusky](https://github.com/jonymusky) - Javascript execution documentation, and wait_for - [datehoer](https://github.com/datehoer) - Add browser prxy support +## Pull Requests + +- [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286) +- [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293) +- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271) + + ## Other Contributors - [Gokhan](https://github.com/gkhngyk) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 05bfd336..37e3c08a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.741" \ No newline at end of file +__version__ = "0.3.743" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3f332eb0..882f9a50 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -35,13 +35,14 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless 
self.browser_process = None self.temp_dir = None - self.debugging_port = 9222 + self.debugging_port = debugging_port + self.host = host self.logger = logger self.shutting_down = False @@ -70,7 +71,7 @@ class ManagedBrowser: # Monitor browser process output for errors asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start - return f"http://localhost:{self.debugging_port}" + return f"http://{self.host}:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") @@ -416,13 +417,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): else: raise ValueError(f"Invalid hook type: {hook_type}") - async def execute_hook(self, hook_type: str, *args): + async def execute_hook(self, hook_type: str, *args, **kwargs): hook = self.hooks.get(hook_type) if hook: if asyncio.iscoroutinefunction(hook): - return await hook(*args) + return await hook(*args, **kwargs) else: - return hook(*args) + return hook(*args, **kwargs) return args[0] if args else None def update_user_agent(self, user_agent: str): @@ -642,6 +643,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): session_id = kwargs.get("session_id") # Handle page creation differently for managed browser + context = None if self.use_managed_browser: if session_id: # Reuse existing session if available @@ -760,7 +762,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return response if not kwargs.get("js_only", False): - await self.execute_hook('before_goto', page) + await self.execute_hook('before_goto', page, context = context) response = await page.goto( @@ -773,7 +775,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # response = await page.goto("about:blank") # await page.evaluate(f"window.location.href = '{url}'") - await self.execute_hook('after_goto', page) + await self.execute_hook('after_goto', page, context = context) # Get status code and headers status_code = 
response.status @@ -838,7 +840,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # await page.wait_for_timeout(100) # Check for on execution event - await self.execute_hook('on_execution_started', page) + await self.execute_hook('on_execution_started', page, context = context) if kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Simulate user interactions @@ -924,7 +926,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("process_iframes", False): page = await self.process_iframes(page) - await self.execute_hook('before_retrieve_html', page) + await self.execute_hook('before_retrieve_html', page, context = context) # Check if delay_before_return_html is set then wait for that time delay_before_return_html = kwargs.get("delay_before_return_html") if delay_before_return_html: @@ -935,7 +937,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.remove_overlay_elements(page) html = await page.content() - await self.execute_hook('before_return_html', page, html) + await self.execute_hook('before_return_html', page, html, context = context) # Check if kwargs has screenshot=True then take screenshot screenshot_data = None diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index b8be6f35..5a46fe39 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -25,7 +25,8 @@ from .config import ( from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, - format_html + format_html, + fast_format_html ) from urllib.parse import urlparse import random @@ -534,16 +535,17 @@ class AsyncWebCrawler: "timing": time.perf_counter() - t1 } ) - - - screenshot = None if not screenshot else screenshot + + if kwargs.get("prettiify", False): + cleaned_html = fast_format_html(cleaned_html) + return CrawlResult( url=url, html=html, - cleaned_html=format_html(cleaned_html), + cleaned_html=cleaned_html, markdown_v2=markdown_v2, markdown=markdown, 
fit_markdown=fit_markdown, diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index ea6a2ef8..ec6c3361 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -10,7 +10,7 @@ from urllib.parse import urljoin from requests.exceptions import InvalidSchema # from .content_cleaning_strategy import ContentCleaningStrategy from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter -from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator from .models import MarkdownGenerationResult from .utils import ( sanitize_input_encode, @@ -105,21 +105,28 @@ class WebScrapingStrategy(ContentScrapingStrategy): Returns: Dict containing markdown content in various formats """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy()) + markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) if markdown_generator: try: + if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: + markdown_generator.content_filter = BM25ContentFilter( + user_query=kwargs.get('fit_markdown_user_query', None), + bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, base_url=url, - html2text_options=kwargs.get('html2text', {}), - content_filter=kwargs.get('content_filter', None) + html2text_options=kwargs.get('html2text', {}) ) + help_message = """""" + return { 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': markdown_result.fit_html or "Set flag 
'fit_markdown' to True to get cleaned HTML content.", + 'fit_markdown': markdown_result.fit_markdown, + 'fit_html': markdown_result.fit_html, 'markdown_v2': markdown_result } except Exception as e: diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 7922c413..b1e43f9d 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -11,6 +11,8 @@ LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + self.content_filter = content_filter @abstractmethod def generate_markdown(self, @@ -23,8 +25,10 @@ class MarkdownGenerationStrategy(ABC): """Generate markdown from cleaned HTML.""" pass -class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): +class DefaultMarkdownGenerator(MarkdownGenerationStrategy): """Default implementation of markdown generation strategy.""" + def __init__(self, content_filter: Optional[RelevantContentFilter] = None): + super().__init__(content_filter) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: link_map = {} @@ -84,14 +88,18 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown = raw_markdown.replace(' ```', '```') # Convert links to citations + markdown_with_citations: str = "" + references_markdown: str = "" if citations: markdown_with_citations, references_markdown = self.convert_links_to_citations( raw_markdown, base_url ) # Generate fit markdown if content filter is provided - fit_markdown: Optional[str] = None - if content_filter: + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + content_filter = content_filter or self.content_filter filtered_html = content_filter.filter_content(cleaned_html) 
filtered_html = '\n'.join('
<div>{}</div>
'.format(s) for s in filtered_html) fit_markdown = h.handle(filtered_html) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index b07562df..aaf27e91 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -233,12 +233,17 @@ def sanitize_html(html): def sanitize_input_encode(text: str) -> str: """Sanitize input to handle potential encoding issues.""" try: - # Attempt to encode and decode as UTF-8 to handle potential encoding issues - return text.encode('utf-8', errors='ignore').decode('utf-8') - except UnicodeEncodeError as e: - print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") - # Fall back to ASCII if UTF-8 fails - return text.encode('ascii', errors='ignore').decode('ascii') + try: + if not text: + return '' + # Attempt to encode and decode as UTF-8 to handle potential encoding issues + return text.encode('utf-8', errors='ignore').decode('utf-8') + except UnicodeEncodeError as e: + print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") + # Fall back to ASCII if UTF-8 fails + return text.encode('ascii', errors='ignore').decode('ascii') + except Exception as e: + raise ValueError(f"Error sanitizing input: {str(e)}") from e def escape_json_string(s): """ @@ -1079,9 +1084,54 @@ def wrap_text(draw, text, font, max_width): return '\n'.join(lines) def format_html(html_string): - soup = BeautifulSoup(html_string, 'html.parser') + soup = BeautifulSoup(html_string, 'lxml.parser') return soup.prettify() +def fast_format_html(html_string): + """ + A fast HTML formatter that uses string operations instead of parsing. 
+ + Args: + html_string (str): The HTML string to format + + Returns: + str: The formatted HTML string + """ + # Initialize variables + indent = 0 + indent_str = " " # Two spaces for indentation + formatted = [] + in_content = False + + # Split by < and > to separate tags and content + parts = html_string.replace('>', '>\n').replace('<', '\n<').split('\n') + + for part in parts: + if not part.strip(): + continue + + # Handle closing tags + if part.startswith(''): + formatted.append(indent_str * indent + part) + + # Handle opening tags + elif part.startswith('<'): + formatted.append(indent_str * indent + part) + indent += 1 + + # Handle content between tags + else: + content = part.strip() + if content: + formatted.append(indent_str * indent + content) + + return '\n'.join(formatted) + def normalize_url(href, base_url): """Normalize URLs to ensure consistent format""" from urllib.parse import urljoin, urlparse diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md index e4b7d7ce..8da3a1cc 100644 --- a/docs/md_v2/advanced/hooks-auth.md +++ b/docs/md_v2/advanced/hooks-auth.md @@ -18,7 +18,7 @@ Let's see how we can customize the AsyncWebCrawler using hooks! 
In this example, import asyncio from crawl4ai import AsyncWebCrawler from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy -from playwright.async_api import Page, Browser +from playwright.async_api import Page, Browser, BrowserContext async def on_browser_created(browser: Browser): print("[HOOK] on_browser_created") @@ -71,7 +71,11 @@ from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy async def main(): print("\nšŸ”— Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!") - crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True) + initial_cookies = [ + {"name": "sessionId", "value": "abc123", "domain": ".example.com"}, + {"name": "userId", "value": "12345", "domain": ".example.com"} + ] + crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True, cookies=initial_cookies) crawler_strategy.set_hook('on_browser_created', on_browser_created) crawler_strategy.set_hook('before_goto', before_goto) crawler_strategy.set_hook('after_goto', after_goto) diff --git a/tests/async/test_markdown_genertor.py b/tests/async/test_markdown_genertor.py index 025a0318..2b1102ab 100644 --- a/tests/async/test_markdown_genertor.py +++ b/tests/async/test_markdown_genertor.py @@ -11,7 +11,7 @@ import asyncio import os import time from typing import Dict, Any -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator # Get current directory __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) @@ -41,7 +41,7 @@ def test_basic_markdown_conversion(): with open(__location__ + "/data/wikipedia.html", "r") as f: cleaned_html = f.read() - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() start_time = time.perf_counter() result = generator.generate_markdown( @@ -70,7 +70,7 @@ def test_relative_links(): Also an [image](/images/test.png) and 
another [page](/wiki/Banana). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://en.wikipedia.org" @@ -86,7 +86,7 @@ def test_duplicate_links(): Here's a [link](/test) and another [link](/test) and a [different link](/other). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" @@ -102,7 +102,7 @@ def test_link_descriptions(): Here's a [link with title](/test "Test Title") and a [link with description](/other) to test. """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com" @@ -120,7 +120,7 @@ def test_performance_large_document(): iterations = 5 times = [] - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() for i in range(iterations): start_time = time.perf_counter() @@ -144,7 +144,7 @@ def test_image_links(): And a regular [link](/page). """ - generator = DefaultMarkdownGenerationStrategy() + generator = DefaultMarkdownGenerator() result = generator.generate_markdown( cleaned_html=markdown, base_url="https://example.com"