Refactor content scraping strategy and improve error handling

2024-10-20 19:11:18 +08:00
parent 6ec4cb33ca
commit 1dd36f9035
2 changed files with 61 additions and 53 deletions
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -187,7 +187,7 @@ class AsyncWebCrawler:
            t1 = time.time()
            scrapping_strategy = WebScrappingStrategy()
            # result = await scrapping_strategy.ascrap(
-            result = await scrapping_strategy.scrap(
+            result = scrapping_strategy.scrap(
                url,
                html,
                word_count_threshold=word_count_threshold,
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -197,6 +197,10 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                keep_element = False
                social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
                social_media_domains = list(set(social_media_domains))
                try:
                    if element.name == 'a' and element.get('href'):
                        href = element['href']
                        url_base = url.split('/')[2]
@@ -208,20 +212,21 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                        keep_element = True
                        if kwargs.get('exclude_external_links', True):
-                        href_url_base = href.split('/')[2]
+                            href_parts = href.split('/')
                            href_url_base = href_parts[2] if len(href_parts) > 2 else href
                            if url_base not in href_url_base:
                                element.decompose()
                                return False
                    # Check if we should esclude links to all major social media platforms
                        if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
                        social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
                        social_media_domains = list(set(social_media_domains))
                            if any(domain in href for domain in social_media_domains):
                                element.decompose()
                                return False
                except Exception as e:
                    raise "Error processing links"
-                elif element.name == 'img':
+                try:
                    if element.name == 'img':
                        # Check flag if we should remove external images
                        if kwargs.get('exclude_external_images', False):
                            src = element.get('src', '')
@@ -235,13 +240,21 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                            src = element.get('src', '')
                            src_url_base = src.split('/')[2]
                            url_base = url.split('/')[2]
-                        if any(domain in src for domain in SOCIAL_MEDIA_DOMAINS):
+                            if any(domain in src for domain in social_media_domains):
                                element.decompose()
                                return False
                        return True  # Always keep image elements
                except Exception as e:
                    raise "Error processing images"
-                elif element.name in ['video', 'audio']:
+                
                # Check if flag to remove all forms is set
                if kwargs.get('remove_forms', False) and element.name == 'form':
                    element.decompose()
                    return False
                if element.name in ['video', 'audio']:
                    media[f"{element.name}s"].append({
                        'src': element.get('src'),
                        'alt': element.get('alt'),
@@ -262,13 +275,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
                    if kwargs.get('only_text', False):
                        element.replace_with(element.get_text())
                try:
                    remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
-                # for attr in element.attrs:
+                except Exception as e:
-                #     if attr not in IMPORTANT_ATTRS or (attr.startswith('data-') and not kwargs.get('keep_data_attributes', False)):
+                    print('Error removing unwanted attributes:', str(e))
                #         del element[attr]                            
                # Print element name and attributes
                print(element.name, element.attrs)
                # Process children
                for child in list(element.children):
@@ -304,9 +315,6 @@ class WebScrappingStrategy(ContentScrappingStrategy):
        process_element(body)
        # # Process images using ThreadPoolExecutor
        imgs = body.find_all('img')
        with ThreadPoolExecutor() as executor: