diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 67cba508..d96f1ded 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -706,9 +706,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: body = flatten_nested_elements(body) base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') for img in imgs: - src = img.get('src', '') - if base64_pattern.match(src): - img['src'] = base64_pattern.sub('', src) + try: + src = img.get('src', '') + if base64_pattern.match(src): + img['src'] = base64_pattern.sub('', src) + except: + pass cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') cleaned_html = sanitize_html(cleaned_html) diff --git a/docs/md_v2/tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md b/docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md similarity index 99% rename from docs/md_v2/tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md rename to docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md index c3a57009..c0daacad 100644 --- a/docs/md_v2/tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md +++ b/docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md @@ -113,4 +113,4 @@ Here’s a clear and focused outline for the **Media Handling: Images, Videos, a --- -This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction. \ No newline at end of file +This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction. diff --git a/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md b/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md similarity index 99% rename from docs/md_v2/tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md rename to docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md index a8a357af..b460ff8c 100644 --- a/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md +++ b/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md @@ -183,4 +183,4 @@ Here’s a detailed outline for the **JSON-CSS Extraction Strategy** video, cove --- -This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users. \ No newline at end of file +This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users. diff --git a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies:_LLM.md b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md similarity index 99% rename from docs/md_v2/tutorial/episode_11_2_Extraction_Strategies:_LLM.md rename to docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md index 900c32f2..3682425f 100644 --- a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies:_LLM.md +++ b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md @@ -150,4 +150,4 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove --- -This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases. \ No newline at end of file +This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases. diff --git a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies:_Cosine.md b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md similarity index 99% rename from docs/md_v2/tutorial/episode_11_3_Extraction_Strategies:_Cosine.md rename to docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md index 61e210e4..9f1c00ea 100644 --- a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies:_Cosine.md +++ b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md @@ -133,4 +133,4 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove --- -This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently. \ No newline at end of file +This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently.