Implement new async crawler features and stability updates

- Introduced new async crawl strategy with session management. - Added BrowserManager for improved browser management. - Enhanced documentation, focusing on storage state and usage examples. - Improved error handling and logging for sessions. - Added JavaScript snippets for customizing navigator properties.
2024-12-10 17:55:29 +08:00
parent 2d31915f0a
commit e130fd8db9
16 changed files with 2750 additions and 749 deletions
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Optional, Dict, Any, Tuple
 from .models import MarkdownGenerationResult
-from .utils import CustomHTML2Text
+from .html2text import CustomHTML2Text
 from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
 import re
 from urllib.parse import urljoin
@@ -9,6 +9,17 @@ from urllib.parse import urljoin
 # Pre-compile the regex pattern
 LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')

+def fast_urljoin(base: str, url: str) -> str:
+    """Fast URL joining for common cases."""
+    if url.startswith(('http://', 'https://', 'mailto:', '//')):
+        return url
+    if url.startswith('/'):
+        # Handle absolute paths
+        if base.endswith('/'):
+            return base[:-1] + url
+        return base + url
+    return urljoin(base, url)
+
 class MarkdownGenerationStrategy(ABC):
    """Abstract base class for markdown generation strategies."""
    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
@@ -118,13 +129,3 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
            fit_html=filtered_html,
        )

-def fast_urljoin(base: str, url: str) -> str:
-    """Fast URL joining for common cases."""
-    if url.startswith(('http://', 'https://', 'mailto:', '//')):
-        return url
-    if url.startswith('/'):
-        # Handle absolute paths
-        if base.endswith('/'):
-            return base[:-1] + url
-        return base + url
-    return urljoin(base, url)