feat: Enhance crawler flexibility and LLM extraction capabilities

- Add browser type selection (Chromium, Firefox, WebKit)
- Implement iframe content extraction
- Improve image processing and dimension updates
- Add custom headers support in AsyncPlaywrightCrawlerStrategy
- Enhance delayed content retrieval with new parameter
- Optimize HTML sanitization and Markdown conversion
- Update examples in quickstart_async.py for new features
This commit is contained in:
unclecode
2024-10-14 21:03:28 +08:00
parent b9bbd42373
commit 320afdea64
7 changed files with 238 additions and 93 deletions

View File

@@ -16,8 +16,6 @@ from .utils import (
CustomHTML2Text
)
class ContentScrappingStrategy(ABC):
@abstractmethod
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
@@ -129,7 +127,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
image_format = os.path.splitext(img.get('src',''))[1].lower()
# Remove . from format
image_format = image_format.strip('.')
image_format = image_format.strip('.').split('?')[0]
score = 0
if height_value:
if height_unit == 'px' and height_value > 150:
@@ -158,6 +156,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
return None
return {
'src': img.get('src', ''),
'data-src': img.get('data-src', ''),
'alt': img.get('alt', ''),
'desc': find_closest_parent_with_useful_text(img),
'score': score,
@@ -275,11 +274,14 @@ class WebScrappingStrategy(ContentScrappingStrategy):
# Replace base64 data with empty string
img['src'] = base64_pattern.sub('', src)
cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')
cleaned_html = sanitize_html(cleaned_html)
h = CustomHTML2Text()
h.ignore_links = True
markdown = h.handle(cleaned_html)
h.body_width = 0
try:
markdown = h.handle(cleaned_html)
except Exception as e:
markdown = h.handle(sanitize_html(cleaned_html))
markdown = markdown.replace(' ```', '```')
try:
@@ -288,6 +290,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
print('Error extracting metadata:', str(e))
meta = {}
cleaned_html = sanitize_html(cleaned_html)
return {
'markdown': markdown,
'cleaned_html': cleaned_html,