Improved database management and error handling, updated README instructions, refined .gitignore, enhanced async web crawling capabilities, and updated dependencies.

2024-11-04 13:22:13 +08:00
parent 62a86dbe8d
commit 54d5a3a259
11 changed files with 461 additions and 669 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -178,7 +178,7 @@ def escape_json_string(s):
    
    return s

-class CustomHTML2Text(HTML2Text):
+class CustomHTML2Text_v0(HTML2Text):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.inside_pre = False
@@ -981,6 +981,19 @@ def format_html(html_string):
    return soup.prettify()

 def normalize_url(href, base_url):
+    """Join href (whitespace-stripped) onto base_url and return the result; raise ValueError if base_url lacks a scheme or netloc."""
+    from urllib.parse import urljoin, urlparse
+
+    # Reject a base URL that has no scheme or no network location
+    parsed_base = urlparse(base_url)
+    if not parsed_base.scheme or not parsed_base.netloc:
+        raise ValueError(f"Invalid base URL format: {base_url}")
+
+    # urljoin resolves a relative href against base_url; an absolute href keeps its own scheme and host
+    normalized = urljoin(base_url, href.strip())
+    return normalized
+
+def normalize_url_tmp(href, base_url):
    """Normalize URLs to ensure consistent format"""
    # Extract protocol and domain from base URL
    try: