feat: Enhance crawler flexibility and LLM extraction capabilities
- Add browser type selection (Chromium, Firefox, WebKit) - Implement iframe content extraction - Improve image processing and dimension updates - Add custom headers support in AsyncPlaywrightCrawlerStrategy - Enhance delayed content retrieval with new parameter - Optimize HTML sanitization and Markdown conversion - Update examples in quickstart_async.py for new features
This commit is contained in:
@@ -131,7 +131,7 @@ def split_and_parse_json_objects(json_string):
|
||||
return parsed_objects, unparsed_segments
|
||||
|
||||
def sanitize_html(html):
|
||||
# Replace all weird and special characters with an empty string
|
||||
# Replace all unwanted and special characters with an empty string
|
||||
sanitized_html = html
|
||||
# sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
|
||||
|
||||
@@ -301,7 +301,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
|
||||
if tag.name != 'img':
|
||||
tag.attrs = {}
|
||||
|
||||
# Extract all img tgas inti [{src: '', alt: ''}]
|
||||
# Extract all img tgas int0 [{src: '', alt: ''}]
|
||||
media = {
|
||||
'images': [],
|
||||
'videos': [],
|
||||
@@ -339,7 +339,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
|
||||
img.decompose()
|
||||
|
||||
|
||||
# Create a function that replace content of all"pre" tage with its inner text
|
||||
# Create a function that replace content of all"pre" tag with its inner text
|
||||
def replace_pre_tags_with_text(node):
|
||||
for child in node.find_all('pre'):
|
||||
# set child inner html to its text
|
||||
@@ -502,7 +502,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
current_tag = tag
|
||||
while current_tag:
|
||||
current_tag = current_tag.parent
|
||||
# Get the text content of the parent tag
|
||||
# Get the text content from the parent tag
|
||||
if current_tag:
|
||||
text_content = current_tag.get_text(separator=' ',strip=True)
|
||||
# Check if the text content has at least word_count_threshold
|
||||
@@ -511,88 +511,88 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
return None
|
||||
|
||||
def process_image(img, url, index, total_images):
|
||||
#Check if an image has valid display and inside undesired html elements
|
||||
def is_valid_image(img, parent, parent_classes):
|
||||
style = img.get('style', '')
|
||||
src = img.get('src', '')
|
||||
classes_to_check = ['button', 'icon', 'logo']
|
||||
tags_to_check = ['button', 'input']
|
||||
return all([
|
||||
'display:none' not in style,
|
||||
src,
|
||||
not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
|
||||
parent.name not in tags_to_check
|
||||
])
|
||||
#Check if an image has valid display and inside undesired html elements
|
||||
def is_valid_image(img, parent, parent_classes):
|
||||
style = img.get('style', '')
|
||||
src = img.get('src', '')
|
||||
classes_to_check = ['button', 'icon', 'logo']
|
||||
tags_to_check = ['button', 'input']
|
||||
return all([
|
||||
'display:none' not in style,
|
||||
src,
|
||||
not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
|
||||
parent.name not in tags_to_check
|
||||
])
|
||||
|
||||
#Score an image for it's usefulness
|
||||
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||
# Function to parse image height/width value and units
|
||||
def parse_dimension(dimension):
|
||||
if dimension:
|
||||
match = re.match(r"(\d+)(\D*)", dimension)
|
||||
if match:
|
||||
number = int(match.group(1))
|
||||
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
|
||||
return number, unit
|
||||
return None, None
|
||||
#Score an image for it's usefulness
|
||||
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||
# Function to parse image height/width value and units
|
||||
def parse_dimension(dimension):
|
||||
if dimension:
|
||||
match = re.match(r"(\d+)(\D*)", dimension)
|
||||
if match:
|
||||
number = int(match.group(1))
|
||||
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
|
||||
return number, unit
|
||||
return None, None
|
||||
|
||||
# Fetch image file metadata to extract size and extension
|
||||
def fetch_image_file_size(img, base_url):
|
||||
#If src is relative path construct full URL, if not it may be CDN URL
|
||||
img_url = urljoin(base_url,img.get('src'))
|
||||
try:
|
||||
response = requests.head(img_url)
|
||||
if response.status_code == 200:
|
||||
return response.headers.get('Content-Length',None)
|
||||
else:
|
||||
print(f"Failed to retrieve file size for {img_url}")
|
||||
return None
|
||||
except InvalidSchema as e:
|
||||
# Fetch image file metadata to extract size and extension
|
||||
def fetch_image_file_size(img, base_url):
|
||||
#If src is relative path construct full URL, if not it may be CDN URL
|
||||
img_url = urljoin(base_url,img.get('src'))
|
||||
try:
|
||||
response = requests.head(img_url)
|
||||
if response.status_code == 200:
|
||||
return response.headers.get('Content-Length',None)
|
||||
else:
|
||||
print(f"Failed to retrieve file size for {img_url}")
|
||||
return None
|
||||
finally:
|
||||
return
|
||||
except InvalidSchema as e:
|
||||
return None
|
||||
finally:
|
||||
return
|
||||
|
||||
image_height = img.get('height')
|
||||
height_value, height_unit = parse_dimension(image_height)
|
||||
image_width = img.get('width')
|
||||
width_value, width_unit = parse_dimension(image_width)
|
||||
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
|
||||
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
||||
# Remove . from format
|
||||
image_format = image_format.strip('.')
|
||||
score = 0
|
||||
if height_value:
|
||||
if height_unit == 'px' and height_value > 150:
|
||||
score += 1
|
||||
if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
|
||||
score += 1
|
||||
if width_value:
|
||||
if width_unit == 'px' and width_value > 150:
|
||||
score += 1
|
||||
if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
|
||||
score += 1
|
||||
if image_size > 10000:
|
||||
image_height = img.get('height')
|
||||
height_value, height_unit = parse_dimension(image_height)
|
||||
image_width = img.get('width')
|
||||
width_value, width_unit = parse_dimension(image_width)
|
||||
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
|
||||
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
||||
# Remove . from format
|
||||
image_format = image_format.strip('.')
|
||||
score = 0
|
||||
if height_value:
|
||||
if height_unit == 'px' and height_value > 150:
|
||||
score += 1
|
||||
if img.get('alt') != '':
|
||||
score+=1
|
||||
if any(image_format==format for format in ['jpg','png','webp']):
|
||||
score+=1
|
||||
if index/images_count<0.5:
|
||||
score+=1
|
||||
return score
|
||||
if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
|
||||
score += 1
|
||||
if width_value:
|
||||
if width_unit == 'px' and width_value > 150:
|
||||
score += 1
|
||||
if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
|
||||
score += 1
|
||||
if image_size > 10000:
|
||||
score += 1
|
||||
if img.get('alt') != '':
|
||||
score+=1
|
||||
if any(image_format==format for format in ['jpg','png','webp']):
|
||||
score+=1
|
||||
if index/images_count<0.5:
|
||||
score+=1
|
||||
return score
|
||||
|
||||
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
||||
return None
|
||||
score = score_image_for_usefulness(img, url, index, total_images)
|
||||
if score <= IMAGE_SCORE_THRESHOLD:
|
||||
return None
|
||||
return {
|
||||
'src': img.get('src', ''),
|
||||
'alt': img.get('alt', ''),
|
||||
'desc': find_closest_parent_with_useful_text(img),
|
||||
'score': score,
|
||||
'type': 'image'
|
||||
}
|
||||
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
||||
return None
|
||||
score = score_image_for_usefulness(img, url, index, total_images)
|
||||
if score <= IMAGE_SCORE_THRESHOLD:
|
||||
return None
|
||||
return {
|
||||
'src': img.get('src', '').replace('\\"', '"').strip(),
|
||||
'alt': img.get('alt', ''),
|
||||
'desc': find_closest_parent_with_useful_text(img),
|
||||
'score': score,
|
||||
'type': 'image'
|
||||
}
|
||||
|
||||
def process_element(element: element.PageElement) -> bool:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user