Refactor content scrapping strategy and improve error handling
@@ -187,7 +187,7 @@ class AsyncWebCrawler:
         t1 = time.time()
         scrapping_strategy = WebScrappingStrategy()
         # result = await scrapping_strategy.ascrap(
-        result = await scrapping_strategy.scrap(
+        result = scrapping_strategy.scrap(
             url,
             html,
             word_count_threshold=word_count_threshold,
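The change above drops `await`: `WebScrappingStrategy.scrap` is synchronous, and the async `ascrap` variant stays commented out. A minimal sketch of how such a blocking call can be kept off the event loop, assuming the same `scrap(url, html, **kwargs)` signature; the wrapper name is illustrative, not part of this commit:

    import asyncio
    from functools import partial

    async def scrap_in_thread(strategy, url, html, **kwargs):
        # Run the synchronous scrap() in the default thread pool so the
        # event loop stays responsive while parsing runs.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, partial(strategy.scrap, url, html, **kwargs)
        )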
@@ -197,51 +197,64 @@ class WebScrappingStrategy(ContentScrappingStrategy):
 
         keep_element = False
 
-        if element.name == 'a' and element.get('href'):
-            href = element['href']
-            url_base = url.split('/')[2]
-            link_data = {'href': href, 'text': element.get_text()}
-            if href.startswith('http') and url_base not in href:
-                links['external'].append(link_data)
-            else:
-                links['internal'].append(link_data)
-            keep_element = True
-
-            if kwargs.get('exclude_external_links', True):
-                href_url_base = href.split('/')[2]
-                if url_base not in href_url_base:
-                    element.decompose()
-                    return False
-
-            # Check if we should exclude links to all major social media platforms
-            if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
-                social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
-                social_media_domains = list(set(social_media_domains))
-                if any(domain in href for domain in social_media_domains):
-                    element.decompose()
-                    return False
-
-        elif element.name == 'img':
-            # Check flag if we should remove external images
-            if kwargs.get('exclude_external_images', False):
-                src = element.get('src', '')
-                src_url_base = src.split('/')[2]
-                url_base = url.split('/')[2]
-                if url_base not in src_url_base:
-                    element.decompose()
-                    return False
-
-            if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
-                src = element.get('src', '')
-                src_url_base = src.split('/')[2]
-                url_base = url.split('/')[2]
-                if any(domain in src for domain in SOCIAL_MEDIA_DOMAINS):
-                    element.decompose()
-                    return False
-
-            return True  # Always keep image elements
-
-        elif element.name in ['video', 'audio']:
+        social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
+        social_media_domains = list(set(social_media_domains))
+
+        try:
+            if element.name == 'a' and element.get('href'):
+                href = element['href']
+                url_base = url.split('/')[2]
+                link_data = {'href': href, 'text': element.get_text()}
+                if href.startswith('http') and url_base not in href:
+                    links['external'].append(link_data)
+                else:
+                    links['internal'].append(link_data)
+                keep_element = True
+
+                if kwargs.get('exclude_external_links', True):
+                    href_parts = href.split('/')
+                    href_url_base = href_parts[2] if len(href_parts) > 2 else href
+                    if url_base not in href_url_base:
+                        element.decompose()
+                        return False
+
+                # Check if we should exclude links to all major social media platforms
+                if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
+                    if any(domain in href for domain in social_media_domains):
+                        element.decompose()
+                        return False
+        except Exception as e:
+            raise Exception("Error processing links") from e
+
+        try:
+            if element.name == 'img':
+                # Check flag if we should remove external images
+                if kwargs.get('exclude_external_images', False):
+                    src = element.get('src', '')
+                    src_url_base = src.split('/')[2]
+                    url_base = url.split('/')[2]
+                    if url_base not in src_url_base:
+                        element.decompose()
+                        return False
+
+                if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
+                    src = element.get('src', '')
+                    src_url_base = src.split('/')[2]
+                    url_base = url.split('/')[2]
+                    if any(domain in src for domain in social_media_domains):
+                        element.decompose()
+                        return False
+
+                return True  # Always keep image elements
+        except Exception as e:
+            raise Exception("Error processing images") from e
+
+        # Check if flag to remove all forms is set
+        if kwargs.get('remove_forms', False) and element.name == 'form':
+            element.decompose()
+            return False
+
+        if element.name in ['video', 'audio']:
             media[f"{element.name}s"].append({
                 'src': element.get('src'),
                 'alt': element.get('alt'),
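The hunk above classifies `<a>` tags as internal or external and filters social-media links, guarding against `IndexError` when `href.split('/')` has no host part. The same rule, sketched in isolation with `urllib.parse` instead of index-based splitting; the domain list is an illustrative subset, not the library's `SOCIAL_MEDIA_DOMAINS`:

    from urllib.parse import urlparse

    SOCIAL_DOMAINS = ['facebook.com', 'twitter.com']  # illustrative subset

    def classify_link(page_url: str, href: str) -> str:
        # Compare netlocs instead of indexing split('/')[2], so relative
        # hrefs (no scheme, no host) safely fall through to 'internal'.
        page_host = urlparse(page_url).netloc
        link_host = urlparse(href).netloc
        if link_host and link_host != page_host:
            if any(domain in link_host for domain in SOCIAL_DOMAINS):
                return 'social'
            return 'external'
        return 'internal'

    print(classify_link('https://example.com/a', '/docs'))                  # internal
    print(classify_link('https://example.com/a', 'https://twitter.com/x'))  # social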
@@ -262,13 +275,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
         if kwargs.get('only_text', False):
             element.replace_with(element.get_text())
 
-        remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
-        # for attr in element.attrs:
-        #     if attr not in IMPORTANT_ATTRS or (attr.startswith('data-') and not kwargs.get('keep_data_attributes', False)):
-        #         del element[attr]
-
-        # Print element name and attributes
-        print(element.name, element.attrs)
+        try:
+            remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
+        except Exception as e:
+            print('Error removing unwanted attributes:', str(e))
 
         # Process children
         for child in list(element.children):
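The commented-out loop on the removed side documents what `remove_unwanted_attributes` is expected to do. A self-contained sketch of that logic for a BeautifulSoup tag, assuming `IMPORTANT_ATTRS` is a set of attribute names to keep (the helper's actual implementation lives outside this diff):

    IMPORTANT_ATTRS = {'src', 'href', 'alt', 'title'}  # illustrative subset

    def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False):
        # Iterate over a copy of the keys: deleting from element.attrs
        # while iterating over it directly would raise RuntimeError.
        for attr in list(element.attrs):
            if attr in important_attrs:
                continue
            if keep_data_attributes and attr.startswith('data-'):
                continue
            del element[attr]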
@@ -304,9 +315,6 @@ class WebScrappingStrategy(ContentScrappingStrategy):
         process_element(body)
 
         # # Process images using ThreadPoolExecutor
-
-
-
         imgs = body.find_all('img')
 
         with ThreadPoolExecutor() as executor:
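The hunk is truncated where the collected `<img>` tags are handed to a `ThreadPoolExecutor`. The per-image worker is not shown in this diff, so the one below is hypothetical; the sketch only illustrates the fan-out pattern:

    from concurrent.futures import ThreadPoolExecutor
    from bs4 import BeautifulSoup

    body = BeautifulSoup('<img src="/a.png" alt="a"><img src="/b.png" alt="b">',
                         'html.parser')

    def process_image(img):
        # Hypothetical worker: the real per-image logic is outside this hunk.
        return {'src': img.get('src'), 'alt': img.get('alt')}

    imgs = body.find_all('img')
    with ThreadPoolExecutor() as executor:
        # map() preserves input order and joins all workers before the
        # with-block exits.
        results = list(executor.map(process_image, imgs))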