From 0bba0e074f720a5d03027ee8fdf699f46ce8af82 Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:12:24 +0100 Subject: [PATCH 1/5] Preventing NoneType has no attribute get Errors Sometimes the list contains Tag elements that do not have attrs set, resulting in this Error. --- crawl4ai/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index baa08a0f..869c22d5 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -706,9 +706,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: body = flatten_nested_elements(body) base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') for img in imgs: - src = img.get('src', '') - if base64_pattern.match(src): - img['src'] = base64_pattern.sub('', src) + try: + src = img.get('src', '') + if base64_pattern.match(src): + img['src'] = base64_pattern.sub('', src) + except: + pass cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') cleaned_html = sanitize_html(cleaned_html) From a28046c233059c3dc2c4ce442e5cda6f7f18645b Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:18:26 +0100 Subject: [PATCH 2/5] Rename episode_08_Media_Handling:_Images,_Videos,_and_Audio.md to episode_08_Media_Handling_Images_Videos_and_Audio.md Name that will work in Windows --- ....md => episode_08_Media_Handling_Images_Videos_and_Audio.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/md_v2/tutorial/{episode_08_Media_Handling:_Images,_Videos,_and_Audio.md => episode_08_Media_Handling_Images_Videos_and_Audio.md} (99%) diff --git a/docs/md_v2/tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md b/docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md similarity index 99% rename from docs/md_v2/tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md rename to docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md index c3a57009..c0daacad 100644 --- a/docs/md_v2/tutorial/episode_08_Media_Handling:_Images,_Videos,_and_Audio.md +++ b/docs/md_v2/tutorial/episode_08_Media_Handling_Images_Videos_and_Audio.md @@ -113,4 +113,4 @@ Here’s a clear and focused outline for the **Media Handling: Images, Videos, a --- -This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction. \ No newline at end of file +This outline provides users with a complete guide to handling images, videos, and audio in Crawl4AI, using metadata to enhance relevance and precision in multimedia extraction. From 870296fa7ee43b221cdede34dbe22a8a2ea4ea4c Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:18:58 +0100 Subject: [PATCH 3/5] Rename episode_11_1_Extraction_Strategies:_JSON_CSS.md to episode_11_1_Extraction_Strategies_JSON_CSS.md Name that will work in Windows --- ...ON_CSS.md => episode_11_1_Extraction_Strategies_JSON_CSS.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/md_v2/tutorial/{episode_11_1_Extraction_Strategies:_JSON_CSS.md => episode_11_1_Extraction_Strategies_JSON_CSS.md} (99%) diff --git a/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md b/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md similarity index 99% rename from docs/md_v2/tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md rename to docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md index a8a357af..b460ff8c 100644 --- a/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies:_JSON_CSS.md +++ b/docs/md_v2/tutorial/episode_11_1_Extraction_Strategies_JSON_CSS.md @@ -183,4 +183,4 @@ Here’s a detailed outline for the **JSON-CSS Extraction Strategy** video, cove --- -This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users. \ No newline at end of file +This outline covers each JSON-CSS Extraction option in Crawl4AI, with practical examples and schema configurations, making it a thorough guide for users. From 3a3c88a2d0d76141179d9284d43021083d1e663b Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:19:20 +0100 Subject: [PATCH 4/5] Rename episode_11_2_Extraction_Strategies:_LLM.md to episode_11_2_Extraction_Strategies_LLM.md Name that will work in Windows --- ...tegies:_LLM.md => episode_11_2_Extraction_Strategies_LLM.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/md_v2/tutorial/{episode_11_2_Extraction_Strategies:_LLM.md => episode_11_2_Extraction_Strategies_LLM.md} (99%) diff --git a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies:_LLM.md b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md similarity index 99% rename from docs/md_v2/tutorial/episode_11_2_Extraction_Strategies:_LLM.md rename to docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md index 900c32f2..3682425f 100644 --- a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies:_LLM.md +++ b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md @@ -150,4 +150,4 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove --- -This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases. \ No newline at end of file +This outline explains LLM Extraction in Crawl4AI, with examples showing how to extract structured data using custom schemas and instructions. It demonstrates flexibility with multiple providers, ensuring practical application for different use cases. From 796dbaf08c92efd606c5b82d00168c29702f6927 Mon Sep 17 00:00:00 2001 From: bizrockman Date: Mon, 4 Nov 2024 20:19:43 +0100 Subject: [PATCH 5/5] Rename episode_11_3_Extraction_Strategies:_Cosine.md to episode_11_3_Extraction_Strategies_Cosine.md Name that will work in Windows --- ...:_Cosine.md => episode_11_3_Extraction_Strategies_Cosine.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename docs/md_v2/tutorial/{episode_11_3_Extraction_Strategies:_Cosine.md => episode_11_3_Extraction_Strategies_Cosine.md} (99%) diff --git a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies:_Cosine.md b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md similarity index 99% rename from docs/md_v2/tutorial/episode_11_3_Extraction_Strategies:_Cosine.md rename to docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md index 61e210e4..9f1c00ea 100644 --- a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies:_Cosine.md +++ b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md @@ -133,4 +133,4 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove --- -This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently. \ No newline at end of file +This outline covers Cosine Similarity Strategy’s speed and effectiveness, providing examples that showcase its potential for clustering various content types efficiently.