Refactored web scraping components
- Enhanced the web scraping strategy with new methods for optimized media handling.
- Added new utility functions for better content processing.
- Refined existing features for improved accuracy and efficiency in scraping tasks.
- Introduced more robust filtering criteria for media elements.
@@ -6,10 +6,11 @@ from concurrent.futures import ThreadPoolExecutor
 import asyncio, requests, re, os
 from .config import *
 from bs4 import element, NavigableString, Comment
+from bs4 import PageElement, Tag
 from urllib.parse import urljoin
 from requests.exceptions import InvalidSchema
 # from .content_cleaning_strategy import ContentCleaningStrategy
-from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, PruningContentFilter
+from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
 from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
 from .models import MarkdownGenerationResult
 from .utils import (
@@ -80,45 +81,21 @@ class WebScrapingStrategy(ContentScrapingStrategy):
     async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
         return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs)
 
 
     def _generate_markdown_content(self,
                                    cleaned_html: str,
                                    html: str,
                                    url: str,
                                    success: bool,
                                    **kwargs) -> Dict[str, Any]:
-        """Generate markdown content using either new strategy or legacy method.
-
-        Args:
-            cleaned_html: Sanitized HTML content
-            html: Original HTML content
-            url: Base URL of the page
-            success: Whether scraping was successful
-            **kwargs: Additional options including:
-                - markdown_generator: Optional[MarkdownGenerationStrategy]
-                - html2text: Dict[str, Any] options for HTML2Text
-                - content_filter: Optional[RelevantContentFilter]
-                - fit_markdown: bool
-                - fit_markdown_user_query: Optional[str]
-                - fit_markdown_bm25_threshold: float
-
-        Returns:
-            Dict containing markdown content in various formats
-        """
         markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())
 
         if markdown_generator:
             try:
                 if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
-                    markdown_generator.content_filter = PruningContentFilter(
-                        threshold_type=kwargs.get('fit_markdown_treshold_type', 'fixed'),
-                        threshold=kwargs.get('fit_markdown_treshold', 0.48),
-                        min_word_threshold=kwargs.get('fit_markdown_min_word_threshold', ),
+                    markdown_generator.content_filter = BM25ContentFilter(
+                        user_query=kwargs.get('fit_markdown_user_query', None),
+                        bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
                     )
-                    # markdown_generator.content_filter = BM25ContentFilter(
-                    #     user_query=kwargs.get('fit_markdown_user_query', None),
-                    #     bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
-                    # )
 
                 markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
                     cleaned_html=cleaned_html,
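The hunk above swaps the default fit_markdown filter from PruningContentFilter back to BM25ContentFilter (the misspelled 'fit_markdown_treshold*' keys disappear together with the pruning branch). A minimal sketch of the resulting kwargs-to-filter mapping, using only names visible in this diff; the absolute import paths are an assumption based on the first hunk:

    from crawl4ai.content_filter_strategy import BM25ContentFilter   # assumed path
    from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator  # assumed path

    kwargs = {'fit_markdown': True, 'fit_markdown_user_query': 'pricing plans'}
    generator = DefaultMarkdownGenerator()
    if kwargs.get('fit_markdown', False) and not generator.content_filter:
        # Same attachment the new code performs inside _generate_markdown_content
        generator.content_filter = BM25ContentFilter(
            user_query=kwargs.get('fit_markdown_user_query', None),
            bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
        )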
@@ -182,13 +159,335 @@ class WebScrapingStrategy(ContentScrapingStrategy):
             'markdown_v2' : markdown_v2
         }
 
+    def flatten_nested_elements(self, node):
+        if isinstance(node, NavigableString):
+            return node
+        if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name:
+            return self.flatten_nested_elements(node.contents[0])
+        node.contents = [self.flatten_nested_elements(child) for child in node.contents]
+        return node
+
+    def find_closest_parent_with_useful_text(self, tag, **kwargs):
+        image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
+        current_tag = tag
+        while current_tag:
+            current_tag = current_tag.parent
+            # Get the text content of the parent tag
+            if current_tag:
+                text_content = current_tag.get_text(separator=' ',strip=True)
+                # Check if the text content has at least word_count_threshold
+                if len(text_content.split()) >= image_description_min_word_threshold:
+                    return text_content
+        return None
+
+    def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
+        attrs_to_remove = []
+        for attr in element.attrs:
+            if attr not in important_attrs:
+                if keep_data_attributes:
+                    if not attr.startswith('data-'):
+                        attrs_to_remove.append(attr)
+                else:
+                    attrs_to_remove.append(attr)
+
+        for attr in attrs_to_remove:
+            del element[attr]
+
+    def process_image(self, img, url, index, total_images, **kwargs):
+        parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
+                                if ' ' in u else None}
+                                for u in [f"http{p}" for p in s.split("http") if p]]
+
+        # Constants for checks
+        classes_to_check = frozenset(['button', 'icon', 'logo'])
+        tags_to_check = frozenset(['button', 'input'])
+
+        # Pre-fetch commonly used attributes
+        style = img.get('style', '')
+        alt = img.get('alt', '')
+        src = img.get('src', '')
+        data_src = img.get('data-src', '')
+        width = img.get('width')
+        height = img.get('height')
+        parent = img.parent
+        parent_classes = parent.get('class', [])
+
+        # Quick validation checks
+        if ('display:none' in style or
+            parent.name in tags_to_check or
+            any(c in cls for c in parent_classes for cls in classes_to_check) or
+            any(c in src for c in classes_to_check) or
+            any(c in alt for c in classes_to_check)):
+            return None
+
+        # Quick score calculation
+        score = 0
+        if width and width.isdigit():
+            width_val = int(width)
+            score += 1 if width_val > 150 else 0
+        if height and height.isdigit():
+            height_val = int(height)
+            score += 1 if height_val > 150 else 0
+        if alt:
+            score += 1
+        score += index/total_images < 0.5
+
+        image_format = ''
+        if "data:image/" in src:
+            image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
+        else:
+            image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
+
+        if image_format in ('jpg', 'png', 'webp', 'avif'):
+            score += 1
+
+        if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
+            return None
+
+        # Use set for deduplication
+        unique_urls = set()
+        image_variants = []
+
+        # Generate a unique group ID for this set of variants
+        group_id = index
+
+        # Base image info template
+        image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
+        base_info = {
+            'alt': alt,
+            'desc': self.find_closest_parent_with_useful_text(img, **kwargs),
+            'score': score,
+            'type': 'image',
+            'group_id': group_id # Group ID for this set of variants
+        }
+
+        # Inline function for adding variants
+        def add_variant(src, width=None):
+            if src and not src.startswith('data:') and src not in unique_urls:
+                unique_urls.add(src)
+                image_variants.append({**base_info, 'src': src, 'width': width})
+
+        # Process all sources
+        add_variant(src)
+        add_variant(data_src)
+
+        # Handle srcset and data-srcset in one pass
+        for attr in ('srcset', 'data-srcset'):
+            if value := img.get(attr):
+                for source in parse_srcset(value):
+                    add_variant(source['url'], source['width'])
+
+        # Quick picture element check
+        if picture := img.find_parent('picture'):
+            for source in picture.find_all('source'):
+                if srcset := source.get('srcset'):
+                    for src in parse_srcset(srcset):
+                        add_variant(src['url'], src['width'])
+
+        # Framework-specific attributes in one pass
+        for attr, value in img.attrs.items():
+            if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value:
+                add_variant(value)
+
+        return image_variants if image_variants else None
+
+
+    def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
+        media = {'images': [], 'videos': [], 'audios': []}
+        internal_links_dict = {}
+        external_links_dict = {}
+        self._process_element(
+            url,
+            element,
+            media,
+            internal_links_dict,
+            external_links_dict,
+            **kwargs
+        )
+        return {
+            'media': media,
+            'internal_links_dict': internal_links_dict,
+            'external_links_dict': external_links_dict
+        }
+
+    def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
+        try:
+            if isinstance(element, NavigableString):
+                if isinstance(element, Comment):
+                    element.extract()
+                return False
+
+            # if element.name == 'img':
+            #     process_image(element, url, 0, 1)
+            #     return True
+
+            if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
+                element.decompose()
+                return False
+
+            keep_element = False
+
+            exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
+            exclude_social_media_domains = list(set(exclude_social_media_domains))
+
+            try:
+                if element.name == 'a' and element.get('href'):
+                    href = element.get('href', '').strip()
+                    if not href:  # Skip empty hrefs
+                        return False
+
+                    url_base = url.split('/')[2]
+
+                    # Normalize the URL
+                    try:
+                        normalized_href = normalize_url(href, url)
+                    except ValueError as e:
+                        # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
+                        return False
+
+                    link_data = {
+                        'href': normalized_href,
+                        'text': element.get_text().strip(),
+                        'title': element.get('title', '').strip()
+                    }
+
+                    # Check for duplicates and add to appropriate dictionary
+                    is_external = is_external_url(normalized_href, url_base)
+                    if is_external:
+                        if normalized_href not in external_links_dict:
+                            external_links_dict[normalized_href] = link_data
+                    else:
+                        if normalized_href not in internal_links_dict:
+                            internal_links_dict[normalized_href] = link_data
+
+                    keep_element = True
+
+                    # Handle external link exclusions
+                    if is_external:
+                        if kwargs.get('exclude_external_links', False):
+                            element.decompose()
+                            return False
+                        elif kwargs.get('exclude_social_media_links', False):
+                            if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
+                                element.decompose()
+                                return False
+                        elif kwargs.get('exclude_domains', []):
+                            if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
+                                element.decompose()
+                                return False
+
+            except Exception as e:
+                raise Exception(f"Error processing links: {str(e)}")
+
+            try:
+                if element.name == 'img':
+                    potential_sources = ['src', 'data-src', 'srcset' 'data-lazy-src', 'data-original']
+                    src = element.get('src', '')
+                    while not src and potential_sources:
+                        src = element.get(potential_sources.pop(0), '')
+                    if not src:
+                        element.decompose()
+                        return False
+
+                    # If it is srcset pick up the first image
+                    if 'srcset' in element.attrs:
+                        src = element.attrs['srcset'].split(',')[0].split(' ')[0]
+
+                    # Check flag if we should remove external images
+                    if kwargs.get('exclude_external_images', False):
+                        src_url_base = src.split('/')[2]
+                        url_base = url.split('/')[2]
+                        if url_base not in src_url_base:
+                            element.decompose()
+                            return False
+
+                    if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False):
+                        src_url_base = src.split('/')[2]
+                        url_base = url.split('/')[2]
+                        if any(domain in src for domain in exclude_social_media_domains):
+                            element.decompose()
+                            return False
+
+                    # Handle exclude domains
+                    if kwargs.get('exclude_domains', []):
+                        if any(domain in src for domain in kwargs.get('exclude_domains', [])):
+                            element.decompose()
+                            return False
+
+                    return True  # Always keep image elements
+            except Exception as e:
+                raise "Error processing images"
+
+
+            # Check if flag to remove all forms is set
+            if kwargs.get('remove_forms', False) and element.name == 'form':
+                element.decompose()
+                return False
+
+            if element.name in ['video', 'audio']:
+                media[f"{element.name}s"].append({
+                    'src': element.get('src'),
+                    'alt': element.get('alt'),
+                    'type': element.name,
+                    'description': self.find_closest_parent_with_useful_text(element, **kwargs)
+                })
+                source_tags = element.find_all('source')
+                for source_tag in source_tags:
+                    media[f"{element.name}s"].append({
+                        'src': source_tag.get('src'),
+                        'alt': element.get('alt'),
+                        'type': element.name,
+                        'description': self.find_closest_parent_with_useful_text(element, **kwargs)
+                    })
+                return True  # Always keep video and audio elements
+
+            if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
+                if kwargs.get('only_text', False):
+                    element.replace_with(element.get_text())
+
+            try:
+                self.remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
+            except Exception as e:
+                # print('Error removing unwanted attributes:', str(e))
+                self._log('error',
+                    message="Error removing unwanted attributes: {error}",
+                    tag="SCRAPE",
+                    params={"error": str(e)}
+                )
+            # Process children
+            for child in list(element.children):
+                if isinstance(child, NavigableString) and not isinstance(child, Comment):
+                    if len(child.strip()) > 0:
+                        keep_element = True
+                else:
+                    if self._process_element(url, child, media, internal_links_dict, external_links_dict, **kwargs):
+                        keep_element = True
+
+            # Check word count
+            word_count_threshold = kwargs.get('word_count_threshold', MIN_WORD_THRESHOLD)
+            if not keep_element:
+                word_count = len(element.get_text(strip=True).split())
+                keep_element = word_count >= word_count_threshold
+
+            if not keep_element:
+                element.decompose()
+
+            return keep_element
+        except Exception as e:
+            # print('Error processing element:', str(e))
+            self._log('error',
+                message="Error processing element: {error}",
+                tag="SCRAPE",
+                params={"error": str(e)}
+            )
+            return False
+
     def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
         success = True
         if not html:
             return None
 
-        # soup = BeautifulSoup(html, 'html.parser')
         soup = BeautifulSoup(html, 'lxml')
         body = soup.body
 
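Among the methods added above, process_image parses srcset strings by splitting on the literal "http" prefix rather than on commas. A standalone sketch of that parser with a single-candidate srcset (the sample URL is illustrative):

    parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
                            if ' ' in u else None}
                            for u in [f"http{p}" for p in s.split("http") if p]]

    print(parse_srcset("https://cdn.example.com/hero-small.jpg 480w"))
    # -> [{'url': 'https://cdn.example.com/hero-small.jpg', 'width': '480'}]

Each parsed variant is then deduplicated through add_variant and tagged with a shared group_id, so downstream consumers can tell which entries describe the same underlying image.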
@@ -200,15 +499,24 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                 tag="SCRAPE",
                 params={"error": str(e)}
             )
-            # print('Error extracting metadata:', str(e))
             meta = {}
 
-        image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
-        for tag in kwargs.get('excluded_tags', []) or []:
-            for el in body.select(tag):
-                el.decompose()
+        # Handle tag-based removal first - faster than CSS selection
+        excluded_tags = set(kwargs.get('excluded_tags', []) or [])
+        if excluded_tags:
+            for element in body.find_all(lambda tag: tag.name in excluded_tags):
+                element.extract()
+
+        # Handle CSS selector-based removal
+        excluded_selector = kwargs.get('excluded_selector', '')
+        if excluded_selector:
+            is_single_selector = ',' not in excluded_selector and ' ' not in excluded_selector
+            if is_single_selector:
+                while element := body.select_one(excluded_selector):
+                    element.extract()
+            else:
+                for element in body.select(excluded_selector):
+                    element.extract()
 
         if css_selector:
             selected_elements = body.select(css_selector)
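A runnable illustration of the two removal passes introduced here, on a toy document (the tag and selector values are chosen for the example):

    from bs4 import BeautifulSoup

    body = BeautifulSoup("<nav>menu</nav><p>keep me</p><div class='ad'>buy!</div>", "lxml").body

    # Tag-based removal: one find_all over a set, as in the new code
    excluded_tags = set(['nav'])
    for element in body.find_all(lambda tag: tag.name in excluded_tags):
        element.extract()

    # Selector-based removal: a simple selector is drained with select_one in a
    # loop; compound selectors fall back to a single select() pass
    excluded_selector = "div.ad"
    while element := body.select_one(excluded_selector):
        element.extract()

    print(body)  # <body><p>keep me</p></body>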
@@ -227,384 +535,17 @@ class WebScrapingStrategy(ContentScrapingStrategy):
             for el in selected_elements:
                 body.append(el)
 
-        links = {'internal': [], 'external': []}
-        media = {'images': [], 'videos': [], 'audios': []}
-        internal_links_dict = {}
-        external_links_dict = {}
-
-        # Extract meaningful text for media files from closest parent
-        def find_closest_parent_with_useful_text(tag):
-            current_tag = tag
-            while current_tag:
-                current_tag = current_tag.parent
-                # Get the text content of the parent tag
-                if current_tag:
-                    text_content = current_tag.get_text(separator=' ',strip=True)
-                    # Check if the text content has at least word_count_threshold
-                    if len(text_content.split()) >= image_description_min_word_threshold:
-                        return text_content
-            return None
-
-        def process_image_old(img, url, index, total_images):
-
-            #Check if an image has valid display and inside undesired html elements
-            def is_valid_image(img, parent, parent_classes):
-                style = img.get('style', '')
-                src = img.get('src', '')
-                classes_to_check = ['button', 'icon', 'logo']
-                tags_to_check = ['button', 'input']
-                return all([
-                    'display:none' not in style,
-                    src,
-                    not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
-                    parent.name not in tags_to_check
-                ])
-
-            #Score an image for it's usefulness
-            def score_image_for_usefulness(img, base_url, index, images_count):
-                image_height = img.get('height')
-                height_value, height_unit = parse_dimension(image_height)
-                image_width = img.get('width')
-                width_value, width_unit = parse_dimension(image_width)
-                image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
-                image_src = img.get('src','')
-                if "data:image/" in image_src:
-                    image_format = image_src.split(',')[0].split(';')[0].split('/')[1]
-                else:
-                    image_format = os.path.splitext(img.get('src',''))[1].lower()
-                # Remove . from format
-                image_format = image_format.strip('.').split('?')[0]
-                score = 0
-                if height_value:
-                    if height_unit == 'px' and height_value > 150:
-                        score += 1
-                    if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
-                        score += 1
-                if width_value:
-                    if width_unit == 'px' and width_value > 150:
-                        score += 1
-                    if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
-                        score += 1
-                if image_size > 10000:
-                    score += 1
-                if img.get('alt') != '':
-                    score+=1
-                if any(image_format==format for format in ['jpg','png','webp']):
-                    score+=1
-                if index/images_count<0.5:
-                    score+=1
-                return score
-
-            if not is_valid_image(img, img.parent, img.parent.get('class', [])):
-                return None
-
-            score = score_image_for_usefulness(img, url, index, total_images)
-            if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
-                return None
-
-            base_result = {
-                'src': img.get('src', ''),
-                'data-src': img.get('data-src', ''),
-                'alt': img.get('alt', ''),
-                'desc': find_closest_parent_with_useful_text(img),
-                'score': score,
-                'type': 'image'
-            }
-
-            sources = []
-            srcset = img.get('srcset', '')
-            if srcset:
-                sources = parse_srcset(srcset)
-            if sources:
-                return [dict(base_result, src=source['url'], width=source['width'])
-                        for source in sources]
-
-            return [base_result]  # Always return a list
-
-        def process_image(img, url, index, total_images):
-            parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
-                                    if ' ' in u else None}
-                                    for u in [f"http{p}" for p in s.split("http") if p]]
-
-            # Constants for checks
-            classes_to_check = frozenset(['button', 'icon', 'logo'])
-            tags_to_check = frozenset(['button', 'input'])
-
-            # Pre-fetch commonly used attributes
-            style = img.get('style', '')
-            alt = img.get('alt', '')
-            src = img.get('src', '')
-            data_src = img.get('data-src', '')
-            width = img.get('width')
-            height = img.get('height')
-            parent = img.parent
-            parent_classes = parent.get('class', [])
-
-            # Quick validation checks
-            if ('display:none' in style or
-                parent.name in tags_to_check or
-                any(c in cls for c in parent_classes for cls in classes_to_check) or
-                any(c in src for c in classes_to_check) or
-                any(c in alt for c in classes_to_check)):
-                return None
-
-            # Quick score calculation
-            score = 0
-            if width and width.isdigit():
-                width_val = int(width)
-                score += 1 if width_val > 150 else 0
-            if height and height.isdigit():
-                height_val = int(height)
-                score += 1 if height_val > 150 else 0
-            if alt:
-                score += 1
-            score += index/total_images < 0.5
-
-            image_format = ''
-            if "data:image/" in src:
-                image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
-            else:
-                image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
-
-            if image_format in ('jpg', 'png', 'webp', 'avif'):
-                score += 1
-
-            if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
-                return None
-
-            # Use set for deduplication
-            unique_urls = set()
-            image_variants = []
-
-            # Generate a unique group ID for this set of variants
-            group_id = index
-
-            # Base image info template
-            base_info = {
-                'alt': alt,
-                'desc': find_closest_parent_with_useful_text(img),
-                'score': score,
-                'type': 'image',
-                'group_id': group_id # Group ID for this set of variants
-            }
-
-            # Inline function for adding variants
-            def add_variant(src, width=None):
-                if src and not src.startswith('data:') and src not in unique_urls:
-                    unique_urls.add(src)
-                    image_variants.append({**base_info, 'src': src, 'width': width})
-
-            # Process all sources
-            add_variant(src)
-            add_variant(data_src)
-
-            # Handle srcset and data-srcset in one pass
-            for attr in ('srcset', 'data-srcset'):
-                if value := img.get(attr):
-                    for source in parse_srcset(value):
-                        add_variant(source['url'], source['width'])
-
-            # Quick picture element check
-            if picture := img.find_parent('picture'):
-                for source in picture.find_all('source'):
-                    if srcset := source.get('srcset'):
-                        for src in parse_srcset(srcset):
-                            add_variant(src['url'], src['width'])
-
-            # Framework-specific attributes in one pass
-            for attr, value in img.attrs.items():
-                if attr.startswith('data-') and ('src' in attr or 'srcset' in attr) and 'http' in value:
-                    add_variant(value)
-
-            return image_variants if image_variants else None
-
-        def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False):
-            attrs_to_remove = []
-            for attr in element.attrs:
-                if attr not in important_attrs:
-                    if keep_data_attributes:
-                        if not attr.startswith('data-'):
-                            attrs_to_remove.append(attr)
-                    else:
-                        attrs_to_remove.append(attr)
-
-            for attr in attrs_to_remove:
-                del element[attr]
-
-        def process_element(element: element.PageElement) -> bool:
-            try:
-                if isinstance(element, NavigableString):
-                    if isinstance(element, Comment):
-                        element.extract()
-                    return False
-
-                # if element.name == 'img':
-                #     process_image(element, url, 0, 1)
-                #     return True
-
-                if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
-                    element.decompose()
-                    return False
-
-                keep_element = False
-
-                exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
-                exclude_social_media_domains = list(set(exclude_social_media_domains))
-
-                try:
-                    if element.name == 'a' and element.get('href'):
-                        href = element.get('href', '').strip()
-                        if not href:  # Skip empty hrefs
-                            return False
-
-                        url_base = url.split('/')[2]
-
-                        # Normalize the URL
-                        try:
-                            normalized_href = normalize_url(href, url)
-                        except ValueError as e:
-                            # logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
-                            return False
-
-                        link_data = {
-                            'href': normalized_href,
-                            'text': element.get_text().strip(),
-                            'title': element.get('title', '').strip()
-                        }
-
-                        # Check for duplicates and add to appropriate dictionary
-                        is_external = is_external_url(normalized_href, url_base)
-                        if is_external:
-                            if normalized_href not in external_links_dict:
-                                external_links_dict[normalized_href] = link_data
-                        else:
-                            if normalized_href not in internal_links_dict:
-                                internal_links_dict[normalized_href] = link_data
-
-                        keep_element = True
-
-                        # Handle external link exclusions
-                        if is_external:
-                            if kwargs.get('exclude_external_links', False):
-                                element.decompose()
-                                return False
-                            elif kwargs.get('exclude_social_media_links', False):
-                                if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
-                                    element.decompose()
-                                    return False
-                            elif kwargs.get('exclude_domains', []):
-                                if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
-                                    element.decompose()
-                                    return False
-
-                except Exception as e:
-                    raise Exception(f"Error processing links: {str(e)}")
-
-                try:
-                    if element.name == 'img':
-                        potential_sources = ['src', 'data-src', 'srcset' 'data-lazy-src', 'data-original']
-                        src = element.get('src', '')
-                        while not src and potential_sources:
-                            src = element.get(potential_sources.pop(0), '')
-                        if not src:
-                            element.decompose()
-                            return False
-
-                        # If it is srcset pick up the first image
-                        if 'srcset' in element.attrs:
-                            src = element.attrs['srcset'].split(',')[0].split(' ')[0]
-
-                        # Check flag if we should remove external images
-                        if kwargs.get('exclude_external_images', False):
-                            src_url_base = src.split('/')[2]
-                            url_base = url.split('/')[2]
-                            if url_base not in src_url_base:
-                                element.decompose()
-                                return False
-
-                        if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False):
-                            src_url_base = src.split('/')[2]
-                            url_base = url.split('/')[2]
-                            if any(domain in src for domain in exclude_social_media_domains):
-                                element.decompose()
-                                return False
-
-                        # Handle exclude domains
-                        if kwargs.get('exclude_domains', []):
-                            if any(domain in src for domain in kwargs.get('exclude_domains', [])):
-                                element.decompose()
-                                return False
-
-                        return True  # Always keep image elements
-                except Exception as e:
-                    raise "Error processing images"
-
-
-                # Check if flag to remove all forms is set
-                if kwargs.get('remove_forms', False) and element.name == 'form':
-                    element.decompose()
-                    return False
-
-                if element.name in ['video', 'audio']:
-                    media[f"{element.name}s"].append({
-                        'src': element.get('src'),
-                        'alt': element.get('alt'),
-                        'type': element.name,
-                        'description': find_closest_parent_with_useful_text(element)
-                    })
-                    source_tags = element.find_all('source')
-                    for source_tag in source_tags:
-                        media[f"{element.name}s"].append({
-                            'src': source_tag.get('src'),
-                            'alt': element.get('alt'),
-                            'type': element.name,
-                            'description': find_closest_parent_with_useful_text(element)
-                        })
-                    return True  # Always keep video and audio elements
-
-                if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
-                    if kwargs.get('only_text', False):
-                        element.replace_with(element.get_text())
-
-                try:
-                    remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
-                except Exception as e:
-                    # print('Error removing unwanted attributes:', str(e))
-                    self._log('error',
-                        message="Error removing unwanted attributes: {error}",
-                        tag="SCRAPE",
-                        params={"error": str(e)}
-                    )
-                # Process children
-                for child in list(element.children):
-                    if isinstance(child, NavigableString) and not isinstance(child, Comment):
-                        if len(child.strip()) > 0:
-                            keep_element = True
-                    else:
-                        if process_element(child):
-                            keep_element = True
-
-                # Check word count
-                if not keep_element:
-                    word_count = len(element.get_text(strip=True).split())
-                    keep_element = word_count >= word_count_threshold
-
-                if not keep_element:
-                    element.decompose()
-
-                return keep_element
-            except Exception as e:
-                # print('Error processing element:', str(e))
-                self._log('error',
-                    message="Error processing element: {error}",
-                    tag="SCRAPE",
-                    params={"error": str(e)}
-                )
-                return False
-
-        process_element(body)
-
+        result_obj = self.process_element(
+            url,
+            body,
+            word_count_threshold = word_count_threshold,
+            **kwargs
+        )
+        links = {'internal': [], 'external': []}
+        media = result_obj['media']
+        internal_links_dict = result_obj['internal_links_dict']
+        external_links_dict = result_obj['external_links_dict']
 
         # Update the links dictionary with unique links
         links['internal'] = list(internal_links_dict.values())
@@ -613,23 +554,14 @@ class WebScrapingStrategy(ContentScrapingStrategy):
         # # Process images using ThreadPoolExecutor
         imgs = body.find_all('img')
 
-        # For test we use for loop instead of thread
         media['images'] = [
-            img for result in (process_image(img, url, i, len(imgs))
+            img for result in (self.process_image(img, url, i, len(imgs))
                             for i, img in enumerate(imgs))
             if result is not None
             for img in result
         ]
 
-        def flatten_nested_elements(node):
-            if isinstance(node, NavigableString):
-                return node
-            if len(node.contents) == 1 and isinstance(node.contents[0], element.Tag) and node.contents[0].name == node.name:
-                return flatten_nested_elements(node.contents[0])
-            node.contents = [flatten_nested_elements(child) for child in node.contents]
-            return node
-
-        body = flatten_nested_elements(body)
+        body = self.flatten_nested_elements(body)
 
         base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
         for img in imgs:
             src = img.get('src', '')
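The two hunks above replace the nested closures with calls into the new class methods; process_element now returns the collected state instead of mutating enclosing scope. A self-contained sketch of the link bookkeeping it performs. Here normalize_url and is_external_url are stood in for with urllib equivalents, which is an assumption for illustration, not the library's implementation:

    from urllib.parse import urljoin, urlparse
    from bs4 import BeautifulSoup

    url = "https://example.com/blog/post"
    url_base = url.split('/')[2]  # 'example.com', as in the diff
    soup = BeautifulSoup('<a href="/about">About</a> <a href="https://other.org/x">Other</a>', 'lxml')

    internal_links_dict, external_links_dict = {}, {}
    for a in soup.find_all('a', href=True):
        normalized = urljoin(url, a['href'])                   # stand-in for normalize_url
        is_external = urlparse(normalized).netloc != url_base  # stand-in for is_external_url
        target = external_links_dict if is_external else internal_links_dict
        target.setdefault(normalized, {'href': normalized,
                                       'text': a.get_text().strip(),
                                       'title': a.get('title', '').strip()})

    print(list(internal_links_dict))  # ['https://example.com/about']
    print(list(external_links_dict))  # ['https://other.org/x']

The caller then flattens the dictionaries with list(internal_links_dict.values()), as the trailing context lines show.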
@@ -22,7 +22,7 @@ import textwrap
 
 from .html2text import HTML2Text
 class CustomHTML2Text(HTML2Text):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, handle_code_in_pre=False, **kwargs):
         super().__init__(*args, **kwargs)
         self.inside_pre = False
         self.inside_code = False
@@ -30,6 +30,7 @@ class CustomHTML2Text(HTML2Text):
         self.current_preserved_tag = None
         self.preserved_content = []
         self.preserve_depth = 0
+        self.handle_code_in_pre = handle_code_in_pre
 
         # Configuration options
         self.skip_internal_links = False
@@ -50,6 +51,8 @@ class CustomHTML2Text(HTML2Text):
         for key, value in kwargs.items():
             if key == 'preserve_tags':
                 self.preserve_tags = set(value)
+            elif key == 'handle_code_in_pre':
+                self.handle_code_in_pre = value
             else:
                 setattr(self, key, value)
 
@@ -88,13 +91,21 @@ class CustomHTML2Text(HTML2Text):
         # Handle pre tags
         if tag == 'pre':
             if start:
-                self.o('```\n')
+                self.o('```\n')  # Markdown code block start
                 self.inside_pre = True
             else:
-                self.o('\n```')
+                self.o('\n```\n')  # Markdown code block end
                 self.inside_pre = False
-        # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
-        #     pass
+        elif tag == 'code':
+            if self.inside_pre and not self.handle_code_in_pre:
+                # Ignore code tags inside pre blocks if handle_code_in_pre is False
+                return
+            if start:
+                self.o('`')  # Markdown inline code start
+                self.inside_code = True
+            else:
+                self.o('`')  # Markdown inline code end
+                self.inside_code = False
         else:
             super().handle_tag(tag, attrs, start)
@@ -103,7 +114,39 @@ class CustomHTML2Text(HTML2Text):
         if self.preserve_depth > 0:
             self.preserved_content.append(data)
             return
+
+        if self.inside_pre:
+            # Output the raw content for pre blocks, including content inside code tags
+            self.o(data)  # Directly output the data as-is (preserve newlines)
+            return
+        if self.inside_code:
+            # Inline code: no newlines allowed
+            self.o(data.replace('\n', ' '))
+            return
+
+        # Default behavior for other tags
         super().handle_data(data, entity_char)
+
+
+    # # Handle pre tags
+    # if tag == 'pre':
+    #     if start:
+    #         self.o('```\n')
+    #         self.inside_pre = True
+    #     else:
+    #         self.o('\n```')
+    #         self.inside_pre = False
+    # # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
+    # #     pass
+    # else:
+    #     super().handle_tag(tag, attrs, start)
+
+    # def handle_data(self, data, entity_char=False):
+    #     """Override handle_data to capture content within preserved tags."""
+    #     if self.preserve_depth > 0:
+    #         self.preserved_content.append(data)
+    #         return
+    #     super().handle_data(data, entity_char)
 
 class InvalidCSSSelectorError(Exception):
     pass
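A hedged usage sketch for the new handle_code_in_pre switch. The import path is assumed (CustomHTML2Text appears to live in the module patched above), and the exact markdown output is indicative, not verified:

    from crawl4ai.utils import CustomHTML2Text  # assumed location

    html = '<p>Run <code>pip install crawl4ai</code> then:</p><pre><code>crawl4ai --help</code></pre>'

    default = CustomHTML2Text()                          # <code> inside <pre> is ignored
    verbose = CustomHTML2Text(handle_code_in_pre=True)   # <code> inside <pre> is emitted too

    print(default.handle(html))  # inline code gets backticks; the pre block becomes a ``` fence
    print(verbose.handle(html))

Note that handle_data now routes pre-block data through unchanged so fenced blocks keep their newlines, while inline code data has newlines collapsed to spaces.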
crawl4ai/utils.scraping.py (new empty file, 0 changes)
@@ -547,6 +547,7 @@ async def generate_knowledge_graph():
         f.write(result.extracted_content)
 
+
 async def fit_markdown_remove_overlay():
     async with AsyncWebCrawler(
         headless=True, # Set to False to see what is happening
         verbose=True,
@@ -560,13 +561,15 @@ async def fit_markdown_remove_overlay():
             url='https://www.kidocode.com/degrees/technology',
             cache_mode=CacheMode.BYPASS,
             markdown_generator=DefaultMarkdownGenerator(
-                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0),
+                content_filter=PruningContentFilter(
+                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
+                ),
                 options={
                     "ignore_links": True
                 }
             ),
             # markdown_generator=DefaultMarkdownGenerator(
-            #     content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0),
+            #     content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
            #     options={
             #         "ignore_links": True
             #     }
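For reference, a minimal end-to-end sketch of the pattern this example hunk reformats. The surrounding imports are assumed to match the quickstart file; the result attribute names come from the diff's own modules (markdown_v2 is a MarkdownGenerationResult):

    import asyncio
    from crawl4ai import AsyncWebCrawler, CacheMode
    from crawl4ai.content_filter_strategy import PruningContentFilter
    from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

    async def main():
        async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
            result = await crawler.arun(
                url='https://www.kidocode.com/degrees/technology',
                cache_mode=CacheMode.BYPASS,
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter(
                        threshold=0.48, threshold_type="fixed", min_word_threshold=0
                    ),
                    options={"ignore_links": True},
                ),
            )
            print(result.markdown_v2.fit_markdown)  # filtered markdown, per MarkdownGenerationResult

    asyncio.run(main())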