Commit Message:

- Added examples for Amazon product data extraction methods
  - Updated configuration options and enhance documentation
  - Minor refactoring for improved performance and readability
  - Cleaned up version control settings.
This commit is contained in:
UncleCode
2024-12-29 20:05:18 +08:00
parent f2d9912697
commit fb33a24891
27 changed files with 4371 additions and 1408 deletions

View File

@@ -64,6 +64,17 @@ class ContentScrapingStrategy(ABC):
pass
class WebScrapingStrategy(ContentScrapingStrategy):
"""
Class for web content scraping. Perhaps the most important class.
How it works:
1. Extract content from HTML using BeautifulSoup.
2. Clean the extracted content using a content cleaning strategy.
3. Filter the cleaned content using a content filtering strategy.
4. Generate markdown content from the filtered content.
5. Return the markdown content.
"""
def __init__(self, logger=None):
self.logger = logger
@@ -74,17 +85,57 @@ class WebScrapingStrategy(ContentScrapingStrategy):
log_method(message=message, tag=tag, **kwargs)
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
"""
Main entry point for content scraping.
Args:
url (str): The URL of the page to scrape.
html (str): The HTML content of the page.
**kwargs: Additional keyword arguments.
Returns:
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
"""
return self._scrap(url, html, is_async=False, **kwargs)
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
"""
Main entry point for asynchronous content scraping.
Args:
url (str): The URL of the page to scrape.
html (str): The HTML content of the page.
**kwargs: Additional keyword arguments.
Returns:
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
"""
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
def _generate_markdown_content(self,
cleaned_html: str,
html: str,
url: str,
success: bool,
**kwargs) -> Dict[str, Any]:
def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
"""
Generate markdown content from cleaned HTML.
Args:
cleaned_html (str): The cleaned HTML content.
html (str): The original HTML content.
url (str): The URL of the page.
success (bool): Whether the content was successfully cleaned.
**kwargs: Additional keyword arguments.
Returns:
Dict[str, Any]: A dictionary containing the generated markdown content.
"""
markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())
if markdown_generator:
@@ -158,6 +209,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
"""
def flatten_nested_elements(self, node):
"""
Flatten nested elements in a HTML tree.
Args:
node (Tag): The root node of the HTML tree.
Returns:
Tag: The flattened HTML tree.
"""
if isinstance(node, NavigableString):
return node
if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name:
@@ -166,6 +226,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return node
def find_closest_parent_with_useful_text(self, tag, **kwargs):
"""
Find the closest parent with useful text.
Args:
tag (Tag): The starting tag to search from.
**kwargs: Additional keyword arguments.
Returns:
Tag: The closest parent with useful text, or None if not found.
"""
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
current_tag = tag
while current_tag:
@@ -179,6 +249,17 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return None
def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
"""
Remove unwanted attributes from an HTML element.
Args:
element (Tag): The HTML element to remove attributes from.
important_attrs (list): List of important attributes to keep.
keep_data_attributes (bool): Whether to keep data attributes.
Returns:
None
"""
attrs_to_remove = []
for attr in element.attrs:
if attr not in important_attrs:
@@ -192,6 +273,26 @@ class WebScrapingStrategy(ContentScrapingStrategy):
del element[attr]
def process_image(self, img, url, index, total_images, **kwargs):
"""
Process an image element.
How it works:
1. Check if the image has valid display and inside undesired html elements.
2. Score an image for it's usefulness.
3. Extract image file metadata to extract size and extension.
4. Generate a dictionary with the processed image information.
5. Return the processed image information.
Args:
img (Tag): The image element to process.
url (str): The URL of the page containing the image.
index (int): The index of the image in the list of images.
total_images (int): The total number of images in the list.
**kwargs: Additional keyword arguments.
Returns:
dict: A dictionary containing the processed image information.
"""
parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
if ' ' in u else None}
for u in [f"http{p}" for p in s.split("http") if p]]
@@ -316,6 +417,23 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return image_variants if image_variants else None
def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
"""
Process an HTML element.
How it works:
1. Check if the element is an image, video, or audio.
2. Extract the element's attributes and content.
3. Process the element based on its type.
4. Return the processed element information.
Args:
url (str): The URL of the page containing the element.
element (Tag): The HTML element to process.
**kwargs: Additional keyword arguments.
Returns:
dict: A dictionary containing the processed element information.
"""
media = {'images': [], 'videos': [], 'audios': []}
internal_links_dict = {}
external_links_dict = {}
@@ -334,6 +452,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
}
def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
"""
Process an HTML element.
"""
try:
if isinstance(element, NavigableString):
if isinstance(element, Comment):
@@ -534,11 +655,25 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return False
def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
"""
Extract content from HTML using BeautifulSoup.
Args:
url (str): The URL of the page to scrape.
html (str): The HTML content of the page to scrape.
word_count_threshold (int): The minimum word count threshold for content extraction.
css_selector (str): The CSS selector to use for content extraction.
**kwargs: Additional keyword arguments.
Returns:
dict: A dictionary containing the extracted content.
"""
success = True
if not html:
return None
soup = BeautifulSoup(html, 'lxml')
parser_type = kwargs.get('parser', 'lxml')
soup = BeautifulSoup(html, parser_type)
body = soup.body
base_domain = get_base_domain(url)