From 1dd36f90359b2a487c9d3faf89711b4818c7c833 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Sun, 20 Oct 2024 19:11:18 +0800
Subject: [PATCH] Refactor content scrapping strategy and improve error
 handling

---
 crawl4ai/async_webcrawler.py           |   2 +-
 crawl4ai/content_scrapping_strategy.py | 112 +++++++++++++------
 2 files changed, 61 insertions(+), 53 deletions(-)

diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 94938b60..2a12c775 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -187,7 +187,7 @@ class AsyncWebCrawler:
                 t1 = time.time()
                 scrapping_strategy = WebScrappingStrategy()
                 # result = await scrapping_strategy.ascrap(
-                result = await scrapping_strategy.scrap(
+                result = scrapping_strategy.scrap(
                     url,
                     html,
                     word_count_threshold=word_count_threshold,
diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py
index 640d1d15..da163539 100644
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -196,52 +196,65 @@ class WebScrappingStrategy(ContentScrappingStrategy):
             return False
 
         keep_element = False
+
+        social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
+        social_media_domains = list(set(social_media_domains))
 
-        if element.name == 'a' and element.get('href'):
-            href = element['href']
-            url_base = url.split('/')[2]
-            link_data = {'href': href, 'text': element.get_text()}
-            if href.startswith('http') and url_base not in href:
-                links['external'].append(link_data)
-            else:
-                links['internal'].append(link_data)
-            keep_element = True
-
-            if kwargs.get('exclude_external_links', True):
-                href_url_base = href.split('/')[2]
-                if url_base not in href_url_base:
-                    element.decompose()
-                    return False
-
-            # Check if we should esclude links to all major social media platforms
-            if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
-                social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
-                social_media_domains = list(set(social_media_domains))
-                if any(domain in href for domain in social_media_domains):
-                    element.decompose()
-                    return False
-
-        elif element.name == 'img':
-            # Check flag if we should remove external images
-            if kwargs.get('exclude_external_images', False):
-                src = element.get('src', '')
-                src_url_base = src.split('/')[2]
+        try:
+            if element.name == 'a' and element.get('href'):
+                href = element['href']
                 url_base = url.split('/')[2]
-                if url_base not in src_url_base:
-                    element.decompose()
-                    return False
+                link_data = {'href': href, 'text': element.get_text()}
+                if href.startswith('http') and url_base not in href:
+                    links['external'].append(link_data)
+                else:
+                    links['internal'].append(link_data)
+                keep_element = True
 
-            if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
-                src = element.get('src', '')
-                src_url_base = src.split('/')[2]
-                url_base = url.split('/')[2]
-                if any(domain in src for domain in SOCIAL_MEDIA_DOMAINS):
-                    element.decompose()
-                    return False
-
-            return True # Always keep image elements
+                if kwargs.get('exclude_external_links', True):
+                    href_parts = href.split('/')
+                    href_url_base = href_parts[2] if len(href_parts) > 2 else href
+                    if url_base not in href_url_base:
+                        element.decompose()
+                        return False
+
+                if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
+                    if any(domain in href for domain in social_media_domains):
+                        element.decompose()
+                        return False
+        except Exception as e:
+            raise Exception("Error processing links") from e
 
-        elif element.name in ['video', 'audio']:
+        try:
+            if element.name == 'img':
+                # Check flag if we should remove external images
+                if kwargs.get('exclude_external_images', False):
+                    src = element.get('src', '')
+                    src_url_base = src.split('/')[2] if len(src.split('/')) > 2 else src
+                    url_base = url.split('/')[2]
+                    if url_base not in src_url_base:
+                        element.decompose()
+                        return False
+
+                if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
+                    src = element.get('src', '')
+                    src_url_base = src.split('/')[2] if len(src.split('/')) > 2 else src
+                    url_base = url.split('/')[2]
+                    if any(domain in src for domain in social_media_domains):
+                        element.decompose()
+                        return False
+
+                return True # Always keep image elements
+        except Exception as e:
+            raise Exception("Error processing images") from e
+
+
+        # Check if flag to remove all forms is set
+        if kwargs.get('remove_forms', False) and element.name == 'form':
+            element.decompose()
+            return False
+
+        if element.name in ['video', 'audio']:
             media[f"{element.name}s"].append({
                 'src': element.get('src'),
                 'alt': element.get('alt'),
@@ -262,13 +275,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
         if kwargs.get('only_text', False):
             element.replace_with(element.get_text())
 
-        remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
-        # for attr in element.attrs:
-        #     if attr not in IMPORTANT_ATTRS or (attr.startswith('data-') and not kwargs.get('keep_data_attributes', False)):
-        #         del element[attr]
-
-        # Print element name and attributes
-        print(element.name, element.attrs)
+        try:
+            remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
+        except Exception as e:
+            print('Error removing unwanted attributes:', str(e))
+
 
         # Process children
         for child in list(element.children):
@@ -304,9 +315,6 @@ class WebScrappingStrategy(ContentScrappingStrategy):
 
         process_element(body)
 
         # # Process images using ThreadPoolExecutor
-
-
-
         imgs = body.find_all('img')
         with ThreadPoolExecutor() as executor:
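
Reviewer note (not part of the patch): a minimal sketch of how the flags
touched above might be exercised end to end. The flag names
(exclude_external_links, exclude_social_media_links, exclude_external_images,
social_media_domains, remove_forms) come from the hunks in this diff; the
AsyncWebCrawler.arun() kwargs pass-through is an assumption based on how
scrap() receives **kwargs here, not something this patch itself guarantees.

    import asyncio
    from crawl4ai import AsyncWebCrawler

    async def main():
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url="https://example.com",
                word_count_threshold=10,          # forwarded to scrap(), per the first hunk
                exclude_external_links=True,      # drop <a> tags whose host differs from the page
                exclude_social_media_links=True,  # drop links/images matching SOCIAL_MEDIA_DOMAINS
                exclude_external_images=False,    # keep off-site <img> tags
                remove_forms=True,                # new flag introduced by this patch
            )
            print(result.markdown[:300])

    asyncio.run(main())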