Merge branch 'proxy-support' into staging

2024-09-01 16:35:14 +08:00
parent 0d9b638636 3caf48c9be
commit b0e8b66666
6 changed files with 51 additions and 13 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -643,7 +643,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
        return node

    body = flatten_nested_elements(body)
-
+    base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
+    for img in imgs:
+        src = img.get('src', '')
+        if base64_pattern.match(src):
+            # Strip the whole matched data URI (prefix and base64 payload) from src
+            img['src'] = base64_pattern.sub('', src)
    cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')
    cleaned_html = sanitize_html(cleaned_html)

@@ -725,7 +730,7 @@ def extract_xml_data(tags, string):
    return data
    
 # Function to perform the completion with exponential backoff
-def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False):
+def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None):
    from litellm import completion 
    from litellm.exceptions import RateLimitError
    max_attempts = 3
@@ -744,6 +749,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
                ],
                temperature=0.01,
                api_key=api_token,
+                base_url=base_url,
                **extra_args
            )
            return response  # Return the successful response
@@ -764,7 +770,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
                    "content": ["Rate limit error. Please try again later."]
                }]
    
-def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
+def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
    # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
    api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
    
@@ -779,7 +785,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
            "{" + variable + "}", variable_values[variable]
        )
        
-    response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)
+    response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url)
        
    try:
        blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
@@ -872,17 +878,17 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):

    return merged_sections

-def process_sections(url: str, sections: list, provider: str, api_token: str) -> list:
+def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
    extracted_content = []
    if provider.startswith("groq/"):
        # Sequential processing with a delay
        for section in sections:
-            extracted_content.extend(extract_blocks(url, section, provider, api_token))
+            extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url))
            time.sleep(0.5)  # 500 ms delay between each processing
    else:
        # Parallel processing using ThreadPoolExecutor
        with ThreadPoolExecutor() as executor:
-            futures = [executor.submit(extract_blocks, url, section, provider, api_token) for section in sections]
+            futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections]
            for future in as_completed(futures):
                extracted_content.extend(future.result())