Commit Message:
- Added examples for Amazon product data extraction methods - Updated configuration options and enhanced documentation - Minor refactoring for improved performance and readability - Cleaned up version control settings.
This commit is contained in:
@@ -11,6 +11,7 @@ from .user_agent_generator import UserAgentGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from typing import Union, List
|
||||
|
||||
|
||||
class BrowserConfig:
|
||||
@@ -39,8 +40,8 @@ class BrowserConfig:
|
||||
Default: None.
|
||||
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
viewport_width (int): Default viewport width for pages. Default: 1920.
|
||||
viewport_height (int): Default viewport height for pages. Default: 1080.
|
||||
viewport_width (int): Default viewport width for pages. Default: 1080.
|
||||
viewport_height (int): Default viewport height for pages. Default: 600.
|
||||
verbose (bool): Enable verbose logging.
|
||||
Default: True.
|
||||
accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
|
||||
@@ -79,7 +80,7 @@ class BrowserConfig:
|
||||
chrome_channel: str = "chrome",
|
||||
proxy: str = None,
|
||||
proxy_config: dict = None,
|
||||
viewport_width: int = 800,
|
||||
viewport_width: int = 1080,
|
||||
viewport_height: int = 600,
|
||||
accept_downloads: bool = False,
|
||||
downloads_path: str = None,
|
||||
@@ -136,10 +137,15 @@ class BrowserConfig:
|
||||
self.debugging_port = debugging_port
|
||||
|
||||
user_agenr_generator = UserAgentGenerator()
|
||||
if self.user_agent_mode != "random":
|
||||
if self.user_agent_mode != "random" and self.user_agent_generator_config:
|
||||
self.user_agent = user_agenr_generator.generate(
|
||||
**(self.user_agent_generator_config or {})
|
||||
)
|
||||
elif self.user_agent_mode == "random":
|
||||
self.user_agent = user_agenr_generator.generate()
|
||||
else:
|
||||
pass
|
||||
|
||||
self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
|
||||
self.headers.setdefault("sec-ch-ua", self.browser_hint)
|
||||
|
||||
@@ -158,8 +164,8 @@ class BrowserConfig:
|
||||
chrome_channel=kwargs.get("chrome_channel", "chrome"),
|
||||
proxy=kwargs.get("proxy"),
|
||||
proxy_config=kwargs.get("proxy_config"),
|
||||
viewport_width=kwargs.get("viewport_width", 1920),
|
||||
viewport_height=kwargs.get("viewport_height", 1080),
|
||||
viewport_width=kwargs.get("viewport_width", 1080),
|
||||
viewport_height=kwargs.get("viewport_height", 600),
|
||||
accept_downloads=kwargs.get("accept_downloads", False),
|
||||
downloads_path=kwargs.get("downloads_path"),
|
||||
storage_state=kwargs.get("storage_state"),
|
||||
@@ -215,6 +221,8 @@ class CrawlerRunConfig:
|
||||
Default: False.
|
||||
prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
|
||||
Default: False.
|
||||
parser_type (str): Type of parser to use for HTML parsing.
|
||||
Default: "lxml".
|
||||
|
||||
# Caching Parameters
|
||||
cache_mode (CacheMode or None): Defines how caching is handled.
|
||||
@@ -322,6 +330,7 @@ class CrawlerRunConfig:
|
||||
keep_data_attributes: bool = False,
|
||||
remove_forms: bool = False,
|
||||
prettiify: bool = False,
|
||||
parser_type: str = "lxml",
|
||||
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
@@ -345,7 +354,7 @@ class CrawlerRunConfig:
|
||||
semaphore_count: int = 5,
|
||||
|
||||
# Page Interaction Parameters
|
||||
js_code=None,
|
||||
js_code: Union[str, List[str]] = None,
|
||||
js_only: bool = False,
|
||||
ignore_body_visibility: bool = True,
|
||||
scan_full_page: bool = False,
|
||||
@@ -393,6 +402,7 @@ class CrawlerRunConfig:
|
||||
self.keep_data_attributes = keep_data_attributes
|
||||
self.remove_forms = remove_forms
|
||||
self.prettiify = prettiify
|
||||
self.parser_type = parser_type
|
||||
|
||||
# SSL Parameters
|
||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||
@@ -478,6 +488,7 @@ class CrawlerRunConfig:
|
||||
keep_data_attributes=kwargs.get("keep_data_attributes", False),
|
||||
remove_forms=kwargs.get("remove_forms", False),
|
||||
prettiify=kwargs.get("prettiify", False),
|
||||
parser_type=kwargs.get("parser_type", "lxml"),
|
||||
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||
@@ -550,6 +561,7 @@ class CrawlerRunConfig:
|
||||
"keep_data_attributes": self.keep_data_attributes,
|
||||
"remove_forms": self.remove_forms,
|
||||
"prettiify": self.prettiify,
|
||||
"parser_type": self.parser_type,
|
||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||
"cache_mode": self.cache_mode,
|
||||
"session_id": self.session_id,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,7 +7,7 @@ from contextlib import asynccontextmanager
|
||||
import logging
|
||||
import json # Added for serialization/deserialization
|
||||
from .utils import ensure_content_dirs, generate_content_hash
|
||||
from .models import CrawlResult
|
||||
from .models import CrawlResult, MarkdownGenerationResult
|
||||
import xxhash
|
||||
import aiofiles
|
||||
from .config import NEED_MIGRATION
|
||||
@@ -295,13 +295,18 @@ class AsyncDatabaseManager:
|
||||
row_dict[field] = ""
|
||||
|
||||
# Parse JSON fields
|
||||
json_fields = ['media', 'links', 'metadata', 'response_headers']
|
||||
json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown']
|
||||
for field in json_fields:
|
||||
try:
|
||||
row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {}
|
||||
except json.JSONDecodeError:
|
||||
row_dict[field] = {}
|
||||
|
||||
if isinstance(row_dict['markdown'], Dict):
|
||||
row_dict['markdown_v2'] = row_dict['markdown']
|
||||
if row_dict['markdown'].get('raw_markdown'):
|
||||
row_dict['markdown'] = row_dict['markdown']['raw_markdown']
|
||||
|
||||
# Parse downloaded_files
|
||||
try:
|
||||
row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else []
|
||||
@@ -331,10 +336,28 @@ class AsyncDatabaseManager:
|
||||
content_map = {
|
||||
'html': (result.html, 'html'),
|
||||
'cleaned_html': (result.cleaned_html or "", 'cleaned'),
|
||||
'markdown': (result.markdown or "", 'markdown'),
|
||||
'markdown': None,
|
||||
'extracted_content': (result.extracted_content or "", 'extracted'),
|
||||
'screenshot': (result.screenshot or "", 'screenshots')
|
||||
}
|
||||
|
||||
try:
|
||||
if isinstance(result.markdown, MarkdownGenerationResult):
|
||||
content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown')
|
||||
elif hasattr(result, 'markdown_v2'):
|
||||
content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown')
|
||||
elif isinstance(result.markdown, str):
|
||||
markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown)
|
||||
content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown')
|
||||
else:
|
||||
content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
message=f"Error processing markdown content: {str(e)}",
|
||||
tag="WARNING"
|
||||
)
|
||||
# Fallback to empty markdown result
|
||||
content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
|
||||
|
||||
content_hashes = {}
|
||||
for field, (content, content_type) in content_map.items():
|
||||
|
||||
@@ -69,6 +69,24 @@ class AsyncWebCrawler:
|
||||
New way (recommended):
|
||||
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
|
||||
Attributes:
|
||||
browser_config (BrowserConfig): Configuration object for browser settings.
|
||||
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
|
||||
logger (AsyncLogger): Logger instance for recording events and errors.
|
||||
always_bypass_cache (bool): Whether to always bypass cache.
|
||||
crawl4ai_folder (str): Directory for storing cache.
|
||||
base_directory (str): Base directory for storing cache.
|
||||
ready (bool): Whether the crawler is ready for use.
|
||||
|
||||
Methods:
|
||||
start(): Start the crawler explicitly without using context manager.
|
||||
close(): Close the crawler explicitly without using context manager.
|
||||
arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
|
||||
awarmup(): Perform warmup sequence.
|
||||
arun_many(): Run the crawler for multiple sources.
|
||||
aprocess_html(): Process HTML content.
|
||||
"""
|
||||
_domain_last_hit = {}
|
||||
|
||||
@@ -321,7 +339,7 @@ class AsyncWebCrawler:
|
||||
|
||||
# Initialize processing variables
|
||||
async_response: AsyncCrawlResponse = None
|
||||
cached_result = None
|
||||
cached_result: CrawlResult = None
|
||||
screenshot_data = None
|
||||
pdf_data = None
|
||||
extracted_content = None
|
||||
@@ -373,52 +391,89 @@ class AsyncWebCrawler:
|
||||
tag="FETCH"
|
||||
)
|
||||
|
||||
# Process the HTML content
|
||||
crawl_result = await self.aprocess_html(
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
config=config, # Pass the config object instead of individual parameters
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
verbose=config.verbose,
|
||||
is_raw_html = True if url.startswith("raw:") else False,
|
||||
**kwargs
|
||||
)
|
||||
# Process the HTML content
|
||||
crawl_result = await self.aprocess_html(
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
config=config, # Pass the config object instead of individual parameters
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
verbose=config.verbose,
|
||||
is_raw_html = True if url.startswith("raw:") else False,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# crawl_result.status_code = async_response.status_code
|
||||
# crawl_result.response_headers = async_response.response_headers
|
||||
# crawl_result.downloaded_files = async_response.downloaded_files
|
||||
# crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
|
||||
# else:
|
||||
# crawl_result.status_code = 200
|
||||
# crawl_result.response_headers = cached_result.response_headers if cached_result else {}
|
||||
# crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache
|
||||
|
||||
# # Check and set values from async_response to crawl_result
|
||||
try:
|
||||
for key in vars(async_response):
|
||||
if hasattr(crawl_result, key):
|
||||
value = getattr(async_response, key, None)
|
||||
current_value = getattr(crawl_result, key, None)
|
||||
if value is not None and not current_value:
|
||||
try:
|
||||
setattr(crawl_result, key, value)
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
message=f"Failed to set attribute {key}: {str(e)}",
|
||||
tag="WARNING"
|
||||
)
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
message=f"Error copying response attributes: {str(e)}",
|
||||
tag="WARNING"
|
||||
)
|
||||
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = getattr(config, 'session_id', None)
|
||||
|
||||
self.logger.success(
|
||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||||
tag="COMPLETE",
|
||||
params={
|
||||
"url": cache_context.display_url,
|
||||
"status": crawl_result.success,
|
||||
"timing": f"{time.perf_counter() - start_time:.2f}s"
|
||||
},
|
||||
colors={
|
||||
"status": Fore.GREEN if crawl_result.success else Fore.RED,
|
||||
"timing": Fore.YELLOW
|
||||
}
|
||||
)
|
||||
|
||||
# Update cache if appropriate
|
||||
if cache_context.should_write() and not bool(cached_result):
|
||||
await async_db_manager.acache_url(crawl_result)
|
||||
|
||||
return crawl_result
|
||||
|
||||
# Set response data
|
||||
if async_response:
|
||||
crawl_result.status_code = async_response.status_code
|
||||
crawl_result.response_headers = async_response.response_headers
|
||||
crawl_result.downloaded_files = async_response.downloaded_files
|
||||
crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
|
||||
else:
|
||||
crawl_result.status_code = 200
|
||||
crawl_result.response_headers = cached_result.response_headers if cached_result else {}
|
||||
crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache
|
||||
self.logger.success(
|
||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||||
tag="COMPLETE",
|
||||
params={
|
||||
"url": cache_context.display_url,
|
||||
"status": True,
|
||||
"timing": f"{time.perf_counter() - start_time:.2f}s"
|
||||
},
|
||||
colors={
|
||||
"status": Fore.GREEN,
|
||||
"timing": Fore.YELLOW
|
||||
}
|
||||
)
|
||||
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = getattr(config, 'session_id', None)
|
||||
|
||||
self.logger.success(
|
||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||||
tag="COMPLETE",
|
||||
params={
|
||||
"url": cache_context.display_url,
|
||||
"status": crawl_result.success,
|
||||
"timing": f"{time.perf_counter() - start_time:.2f}s"
|
||||
},
|
||||
colors={
|
||||
"status": Fore.GREEN if crawl_result.success else Fore.RED,
|
||||
"timing": Fore.YELLOW
|
||||
}
|
||||
)
|
||||
|
||||
# Update cache if appropriate
|
||||
if cache_context.should_write() and not bool(cached_result):
|
||||
await async_db_manager.acache_url(crawl_result)
|
||||
|
||||
return crawl_result
|
||||
cached_result.success = bool(html)
|
||||
cached_result.session_id = getattr(config, 'session_id', None)
|
||||
return cached_result
|
||||
|
||||
except Exception as e:
|
||||
error_context = get_error_context(sys.exc_info())
|
||||
@@ -465,6 +520,7 @@ class AsyncWebCrawler:
|
||||
extracted_content: Previously extracted content (if any)
|
||||
config: Configuration object controlling processing behavior
|
||||
screenshot: Screenshot data (if any)
|
||||
pdf_data: PDF data (if any)
|
||||
verbose: Whether to enable verbose logging
|
||||
**kwargs: Additional parameters for backwards compatibility
|
||||
|
||||
|
||||
@@ -25,8 +25,26 @@ class CacheContext:
|
||||
|
||||
This class centralizes all cache-related logic and URL type checking,
|
||||
making the caching behavior more predictable and maintainable.
|
||||
|
||||
Attributes:
|
||||
url (str): The URL being processed.
|
||||
cache_mode (CacheMode): The cache mode for the current operation.
|
||||
always_bypass (bool): If True, bypasses caching for this operation.
|
||||
is_cacheable (bool): True if the URL is cacheable, False otherwise.
|
||||
is_web_url (bool): True if the URL is a web URL, False otherwise.
|
||||
is_local_file (bool): True if the URL is a local file, False otherwise.
|
||||
is_raw_html (bool): True if the URL is raw HTML, False otherwise.
|
||||
_url_display (str): The display name for the URL (web, local file, or raw HTML).
|
||||
"""
|
||||
def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
|
||||
"""
|
||||
Initializes the CacheContext with the provided URL and cache mode.
|
||||
|
||||
Args:
|
||||
url (str): The URL being processed.
|
||||
cache_mode (CacheMode): The cache mode for the current operation.
|
||||
always_bypass (bool): If True, bypasses caching for this operation.
|
||||
"""
|
||||
self.url = url
|
||||
self.cache_mode = cache_mode
|
||||
self.always_bypass = always_bypass
|
||||
@@ -37,13 +55,31 @@ class CacheContext:
|
||||
self._url_display = url if not self.is_raw_html else "Raw HTML"
|
||||
|
||||
def should_read(self) -> bool:
    """
    Determine whether the cache should be read for this context.

    How it works:
        1. If always_bypass is True or is_cacheable is False, return False.
        2. Otherwise return True only if cache_mode is ENABLED or READ_ONLY.

    Returns:
        bool: True if cache should be read, False otherwise.
    """
    # Guard first: a bypassed or non-cacheable URL never touches the cache,
    # regardless of the configured cache mode.
    if self.always_bypass or not self.is_cacheable:
        return False
    return self.cache_mode in (CacheMode.ENABLED, CacheMode.READ_ONLY)
|
||||
|
||||
def should_write(self) -> bool:
    """
    Determine whether the cache should be written for this context.

    How it works:
        1. If always_bypass is True or is_cacheable is False, return False.
        2. Otherwise return True only if cache_mode is ENABLED or WRITE_ONLY.

    Returns:
        bool: True if cache should be written, False otherwise.
    """
    # Guard first: a bypassed or non-cacheable URL never touches the cache,
    # regardless of the configured cache mode.
    if self.always_bypass or not self.is_cacheable:
        return False
    return self.cache_mode in (CacheMode.ENABLED, CacheMode.WRITE_ONLY)
|
||||
|
||||
@@ -7,22 +7,43 @@ from .utils import *
|
||||
|
||||
# Define the abstract base class for chunking strategies
|
||||
class ChunkingStrategy(ABC):
    """
    Abstract base class for chunking strategies.

    Concrete strategies implement `chunk` to turn one text into a list of
    chunks.
    """

    @abstractmethod
    def chunk(self, text: str) -> list:
        """
        Abstract method to chunk the given text.

        Args:
            text (str): The text to chunk.

        Returns:
            list: A list of chunks.
        """
        pass


# Create an identity chunking strategy f(x) = [x]
class IdentityChunking(ChunkingStrategy):
    """
    Chunking strategy that returns the input text as a single chunk.
    """

    def chunk(self, text: str) -> list:
        """
        Return the input unchanged, wrapped in a one-element list.

        Args:
            text (str): The text to chunk.

        Returns:
            list: `[text]`.
        """
        return [text]
|
||||
|
||||
# Regex-based chunking
|
||||
class RegexChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text based on regular expression patterns.
|
||||
"""
|
||||
def __init__(self, patterns=None, **kwargs):
|
||||
"""
|
||||
Initialize the RegexChunking object.
|
||||
|
||||
Args:
|
||||
patterns (list): A list of regular expression patterns to split text.
|
||||
"""
|
||||
if patterns is None:
|
||||
patterns = [r'\n\n'] # Default split pattern
|
||||
self.patterns = patterns
|
||||
@@ -38,9 +59,15 @@ class RegexChunking(ChunkingStrategy):
|
||||
|
||||
# NLP-based sentence chunking
|
||||
class NlpSentenceChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
|
||||
"""
|
||||
def __init__(self, **kwargs):
    """
    Initialize the NlpSentenceChunking object.

    Calls `load_nltk_punkt()` at construction time. `**kwargs` is accepted
    but not used by this initializer.
    """
    load_nltk_punkt()
|
||||
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
# Improved regex for sentence splitting
|
||||
@@ -57,8 +84,21 @@ class NlpSentenceChunking(ChunkingStrategy):
|
||||
|
||||
# Topic-based segmentation using TextTiling
|
||||
class TopicSegmentationChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.
|
||||
|
||||
How it works:
|
||||
1. Segment the text into topics using TextTilingTokenizer
|
||||
2. Extract keywords for each topic segment
|
||||
"""
|
||||
|
||||
def __init__(self, num_keywords=3, **kwargs):
|
||||
"""
|
||||
Initialize the TopicSegmentationChunking object.
|
||||
|
||||
Args:
|
||||
num_keywords (int): The number of keywords to extract for each topic segment.
|
||||
"""
|
||||
import nltk as nl
|
||||
self.tokenizer = nl.tokenize.TextTilingTokenizer()
|
||||
self.num_keywords = num_keywords
|
||||
@@ -88,6 +128,14 @@ class TopicSegmentationChunking(ChunkingStrategy):
|
||||
|
||||
# Fixed-length word chunks
|
||||
class FixedLengthWordChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text into fixed-length word chunks.
|
||||
|
||||
How it works:
|
||||
1. Split the text into words
|
||||
2. Create chunks of fixed length
|
||||
3. Return the list of chunks
|
||||
"""
|
||||
def __init__(self, chunk_size=100, **kwargs):
|
||||
"""
|
||||
Initialize the fixed-length word chunking strategy with the given chunk size.
|
||||
@@ -103,6 +151,14 @@ class FixedLengthWordChunking(ChunkingStrategy):
|
||||
|
||||
# Sliding window chunking
|
||||
class SlidingWindowChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text into overlapping word chunks.
|
||||
|
||||
How it works:
|
||||
1. Split the text into words
|
||||
2. Create chunks of fixed length
|
||||
3. Return the list of chunks
|
||||
"""
|
||||
def __init__(self, window_size=100, step=50, **kwargs):
|
||||
"""
|
||||
Initialize the sliding window chunking strategy with the given window size and
|
||||
@@ -133,6 +189,15 @@ class SlidingWindowChunking(ChunkingStrategy):
|
||||
return chunks
|
||||
|
||||
class OverlappingWindowChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text into overlapping word chunks.
|
||||
|
||||
How it works:
|
||||
1. Split the text into words using whitespace
|
||||
2. Create chunks of fixed length equal to the window size
|
||||
3. Slide the window by the overlap size
|
||||
4. Return the list of chunks
|
||||
"""
|
||||
def __init__(self, window_size=1000, overlap=100, **kwargs):
|
||||
"""
|
||||
Initialize the overlapping window chunking strategy with the given window size and
|
||||
|
||||
@@ -9,17 +9,8 @@ from .utils import clean_tokens
|
||||
from abc import ABC, abstractmethod
|
||||
import math
|
||||
from snowballstemmer import stemmer
|
||||
|
||||
|
||||
# import regex
|
||||
# def tokenize_text(text):
|
||||
# # Regular expression to match words or CJK (Chinese, Japanese, Korean) characters
|
||||
# pattern = r'\p{L}+|\p{N}+|[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}ー]|[\p{P}]'
|
||||
# return regex.findall(pattern, text)
|
||||
|
||||
# from nltk.stem import PorterStemmer
|
||||
# ps = PorterStemmer()
|
||||
class RelevantContentFilter(ABC):
|
||||
"""Abstract base class for content filtering strategies"""
|
||||
def __init__(self, user_query: str = None):
|
||||
self.user_query = user_query
|
||||
self.included_tags = {
|
||||
@@ -171,9 +162,8 @@ class RelevantContentFilter(ABC):
|
||||
chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold]
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def extract_text_chunks1(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
|
||||
def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
|
||||
"""Common method for extracting text chunks"""
|
||||
_text_cache = {}
|
||||
def fast_text(element: Tag) -> str:
|
||||
@@ -271,7 +261,38 @@ class RelevantContentFilter(ABC):
|
||||
return str(tag) # Fallback to original if anything fails
|
||||
|
||||
class BM25ContentFilter(RelevantContentFilter):
|
||||
"""
|
||||
Content filtering using BM25 algorithm with priority tag handling.
|
||||
|
||||
How it works:
|
||||
1. Extracts page metadata with fallbacks.
|
||||
2. Extracts text chunks from the body element.
|
||||
3. Tokenizes the corpus and query.
|
||||
4. Applies BM25 algorithm to calculate scores for each chunk.
|
||||
5. Filters out chunks below the threshold.
|
||||
6. Sorts chunks by score in descending order.
|
||||
7. Returns the top N chunks.
|
||||
|
||||
Attributes:
|
||||
user_query (str): User query for filtering (optional).
|
||||
bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
|
||||
language (str): Language for stemming (default: 'english').
|
||||
|
||||
Methods:
|
||||
filter_content(self, html: str, min_word_threshold: int = None)
|
||||
"""
|
||||
def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
|
||||
"""
|
||||
Initializes the BM25ContentFilter class. If no user query is provided, it falls back to page metadata.
|
||||
|
||||
Note:
|
||||
If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
|
||||
|
||||
Args:
|
||||
user_query (str): User query for filtering (optional).
|
||||
bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
|
||||
language (str): Language for stemming (default: 'english').
|
||||
"""
|
||||
super().__init__(user_query=user_query)
|
||||
self.bm25_threshold = bm25_threshold
|
||||
self.priority_tags = {
|
||||
@@ -290,7 +311,20 @@ class BM25ContentFilter(RelevantContentFilter):
|
||||
self.stemmer = stemmer(language)
|
||||
|
||||
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
|
||||
"""Implements content filtering using BM25 algorithm with priority tag handling"""
|
||||
"""
|
||||
Implements content filtering using BM25 algorithm with priority tag handling.
|
||||
|
||||
Note:
|
||||
This method implements the filtering logic for the BM25ContentFilter class.
|
||||
It takes HTML content as input and returns a list of filtered text chunks.
|
||||
|
||||
Args:
|
||||
html (str): HTML content to be filtered.
|
||||
min_word_threshold (int): Minimum word threshold for filtering (optional).
|
||||
|
||||
Returns:
|
||||
List[str]: List of filtered text chunks.
|
||||
"""
|
||||
if not html or not isinstance(html, str):
|
||||
return []
|
||||
|
||||
@@ -357,15 +391,42 @@ class BM25ContentFilter(RelevantContentFilter):
|
||||
|
||||
return [self.clean_element(tag) for _, _, tag in selected_candidates]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class PruningContentFilter(RelevantContentFilter):
|
||||
"""
|
||||
Content filtering using pruning algorithm with dynamic threshold.
|
||||
|
||||
How it works:
|
||||
1. Extracts page metadata with fallbacks.
|
||||
2. Extracts text chunks from the body element.
|
||||
3. Applies pruning algorithm to calculate scores for each chunk.
|
||||
4. Filters out chunks below the threshold.
|
||||
5. Sorts chunks by score in descending order.
|
||||
6. Returns the top N chunks.
|
||||
|
||||
Attributes:
|
||||
user_query (str): User query for filtering (optional), if not provided, falls back to page metadata.
|
||||
min_word_threshold (int): Minimum word threshold for filtering (optional).
|
||||
threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
|
||||
threshold (float): Fixed threshold value (default: 0.48).
|
||||
|
||||
Methods:
|
||||
filter_content(self, html: str, min_word_threshold: int = None):
|
||||
"""
|
||||
def __init__(self, user_query: str = None, min_word_threshold: int = None,
|
||||
threshold_type: str = 'fixed', threshold: float = 0.48):
|
||||
super().__init__(user_query)
|
||||
"""
|
||||
Initializes the PruningContentFilter class. If no user query is provided, it falls back to page metadata.
|
||||
|
||||
Note:
|
||||
If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
|
||||
|
||||
Args:
|
||||
user_query (str): User query for filtering (optional).
|
||||
min_word_threshold (int): Minimum word threshold for filtering (optional).
|
||||
threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
|
||||
threshold (float): Fixed threshold value (default: 0.48).
|
||||
"""
|
||||
super().__init__(None)
|
||||
self.min_word_threshold = min_word_threshold
|
||||
self.threshold_type = threshold_type
|
||||
self.threshold = threshold
|
||||
@@ -418,6 +479,20 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
}
|
||||
|
||||
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
|
||||
"""
|
||||
Implements content filtering using pruning algorithm with dynamic threshold.
|
||||
|
||||
Note:
|
||||
This method implements the filtering logic for the PruningContentFilter class.
|
||||
It takes HTML content as input and returns a list of filtered text chunks.
|
||||
|
||||
Args:
|
||||
html (str): HTML content to be filtered.
|
||||
min_word_threshold (int): Minimum word threshold for filtering (optional).
|
||||
|
||||
Returns:
|
||||
List[str]: List of filtered text chunks.
|
||||
"""
|
||||
if not html or not isinstance(html, str):
|
||||
return []
|
||||
|
||||
@@ -444,15 +519,23 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
return content_blocks
|
||||
|
||||
def _remove_comments(self, soup):
|
||||
"""Removes HTML comments"""
|
||||
for element in soup(text=lambda text: isinstance(text, Comment)):
|
||||
element.extract()
|
||||
|
||||
def _remove_unwanted_tags(self, soup):
|
||||
"""Removes unwanted tags"""
|
||||
for tag in self.excluded_tags:
|
||||
for element in soup.find_all(tag):
|
||||
element.decompose()
|
||||
|
||||
def _prune_tree(self, node):
|
||||
"""
|
||||
Prunes the tree starting from the given node.
|
||||
|
||||
Args:
|
||||
node (Tag): The node from which the pruning starts.
|
||||
"""
|
||||
if not node or not hasattr(node, 'name') or node.name is None:
|
||||
return
|
||||
|
||||
@@ -495,6 +578,7 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
self._prune_tree(child)
|
||||
|
||||
def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
|
||||
"""Computes the composite score"""
|
||||
if self.min_word_threshold:
|
||||
# Get raw text from metrics node - avoid extra processing
|
||||
text = metrics['node'].get_text(strip=True)
|
||||
@@ -531,6 +615,7 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
return score / total_weight if total_weight > 0 else 0
|
||||
|
||||
def _compute_class_id_weight(self, node):
|
||||
"""Computes the class ID weight"""
|
||||
class_id_score = 0
|
||||
if 'class' in node.attrs:
|
||||
classes = ' '.join(node['class'])
|
||||
|
||||
@@ -64,6 +64,17 @@ class ContentScrapingStrategy(ABC):
|
||||
pass
|
||||
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
Class for web content scraping. Perhaps the most important class.
|
||||
|
||||
How it works:
|
||||
1. Extract content from HTML using BeautifulSoup.
|
||||
2. Clean the extracted content using a content cleaning strategy.
|
||||
3. Filter the cleaned content using a content filtering strategy.
|
||||
4. Generate markdown content from the filtered content.
|
||||
5. Return the markdown content.
|
||||
"""
|
||||
|
||||
def __init__(self, logger=None):
|
||||
self.logger = logger
|
||||
|
||||
@@ -74,17 +85,57 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
log_method(message=message, tag=tag, **kwargs)
|
||||
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
    """
    Main entry point for content scraping.

    Delegates to `self._scrap` with `is_async=False`.

    Args:
        url (str): The URL of the page to scrape.
        html (str): The HTML content of the page.
        **kwargs: Additional keyword arguments, forwarded to `_scrap`.

    Returns:
        Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:

        - 'markdown': The generated markdown content. Currently a str; it will soon become a MarkdownGenerationResult, with the text available via 'markdown.raw_markdown'.
        - 'fit_markdown': The generated markdown content with relevant content filtered. This will be removed soon and made available in 'markdown.fit_markdown'.
        - 'fit_html': The HTML content with relevant content filtered. This will be removed soon and made available in 'markdown.fit_html'.
        - 'markdown_v2': The generated markdown content with relevant content filtered. This is temporary and will be removed soon and replaced with 'markdown'.
    """
    return self._scrap(url, html, is_async=False, **kwargs)
|
||||
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
    """
    Main entry point for asynchronous content scraping.

    Runs `self._scrap` through `asyncio.to_thread` and awaits its result.

    Args:
        url (str): The URL of the page to scrape.
        html (str): The HTML content of the page.
        **kwargs: Additional keyword arguments, forwarded to `_scrap`.

    Returns:
        Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:

        - 'markdown': The generated markdown content. Currently a str; it will soon become a MarkdownGenerationResult, with the text available via 'markdown.raw_markdown'.
        - 'fit_markdown': The generated markdown content with relevant content filtered. This will be removed soon and made available in 'markdown.fit_markdown'.
        - 'fit_html': The HTML content with relevant content filtered. This will be removed soon and made available in 'markdown.fit_html'.
        - 'markdown_v2': The generated markdown content with relevant content filtered. This is temporary and will be removed soon and replaced with 'markdown'.
    """
    return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
||||
|
||||
def _generate_markdown_content(self,
|
||||
cleaned_html: str,
|
||||
html: str,
|
||||
url: str,
|
||||
success: bool,
|
||||
**kwargs) -> Dict[str, Any]:
|
||||
def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate markdown content from cleaned HTML.
|
||||
|
||||
Args:
|
||||
cleaned_html (str): The cleaned HTML content.
|
||||
html (str): The original HTML content.
|
||||
url (str): The URL of the page.
|
||||
success (bool): Whether the content was successfully cleaned.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the generated markdown content.
|
||||
"""
|
||||
markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())
|
||||
|
||||
if markdown_generator:
|
||||
@@ -158,6 +209,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
|
||||
def flatten_nested_elements(self, node):
|
||||
"""
|
||||
Flatten nested elements in a HTML tree.
|
||||
|
||||
Args:
|
||||
node (Tag): The root node of the HTML tree.
|
||||
|
||||
Returns:
|
||||
Tag: The flattened HTML tree.
|
||||
"""
|
||||
if isinstance(node, NavigableString):
|
||||
return node
|
||||
if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name:
|
||||
@@ -166,6 +226,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return node
|
||||
|
||||
def find_closest_parent_with_useful_text(self, tag, **kwargs):
|
||||
"""
|
||||
Find the closest parent with useful text.
|
||||
|
||||
Args:
|
||||
tag (Tag): The starting tag to search from.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Tag: The closest parent with useful text, or None if not found.
|
||||
"""
|
||||
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
||||
current_tag = tag
|
||||
while current_tag:
|
||||
@@ -179,6 +249,17 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return None
|
||||
|
||||
def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
|
||||
"""
|
||||
Remove unwanted attributes from an HTML element.
|
||||
|
||||
Args:
|
||||
element (Tag): The HTML element to remove attributes from.
|
||||
important_attrs (list): List of important attributes to keep.
|
||||
keep_data_attributes (bool): Whether to keep data attributes.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
attrs_to_remove = []
|
||||
for attr in element.attrs:
|
||||
if attr not in important_attrs:
|
||||
@@ -192,6 +273,26 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
del element[attr]
|
||||
|
||||
def process_image(self, img, url, index, total_images, **kwargs):
|
||||
"""
|
||||
Process an image element.
|
||||
|
||||
How it works:
|
||||
1. Check if the image has valid display and inside undesired html elements.
|
||||
2. Score an image for its usefulness.
|
||||
3. Extract image file metadata to extract size and extension.
|
||||
4. Generate a dictionary with the processed image information.
|
||||
5. Return the processed image information.
|
||||
|
||||
Args:
|
||||
img (Tag): The image element to process.
|
||||
url (str): The URL of the page containing the image.
|
||||
index (int): The index of the image in the list of images.
|
||||
total_images (int): The total number of images in the list.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the processed image information.
|
||||
"""
|
||||
parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
|
||||
if ' ' in u else None}
|
||||
for u in [f"http{p}" for p in s.split("http") if p]]
|
||||
@@ -316,6 +417,23 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return image_variants if image_variants else None
|
||||
|
||||
def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Process an HTML element.
|
||||
|
||||
How it works:
|
||||
1. Check if the element is an image, video, or audio.
|
||||
2. Extract the element's attributes and content.
|
||||
3. Process the element based on its type.
|
||||
4. Return the processed element information.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page containing the element.
|
||||
element (Tag): The HTML element to process.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the processed element information.
|
||||
"""
|
||||
media = {'images': [], 'videos': [], 'audios': []}
|
||||
internal_links_dict = {}
|
||||
external_links_dict = {}
|
||||
@@ -334,6 +452,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
}
|
||||
|
||||
def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
|
||||
"""
|
||||
Process an HTML element.
|
||||
"""
|
||||
try:
|
||||
if isinstance(element, NavigableString):
|
||||
if isinstance(element, Comment):
|
||||
@@ -534,11 +655,25 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return False
|
||||
|
||||
def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract content from HTML using BeautifulSoup.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page to scrape.
|
||||
html (str): The HTML content of the page to scrape.
|
||||
word_count_threshold (int): The minimum word count threshold for content extraction.
|
||||
css_selector (str): The CSS selector to use for content extraction.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the extracted content.
|
||||
"""
|
||||
success = True
|
||||
if not html:
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
parser_type = kwargs.get('parser', 'lxml')
|
||||
soup = BeautifulSoup(html, parser_type)
|
||||
body = soup.body
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
|
||||
1440
crawl4ai/extraction_strategy.bak.py
Normal file
1440
crawl4ai/extraction_strategy.bak.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -62,29 +62,66 @@ class ExtractionStrategy(ABC):
|
||||
return extracted_content
|
||||
|
||||
class NoExtractionStrategy(ExtractionStrategy):
    """
    Pass-through strategy: no real extraction is performed, the input is simply wrapped in the block format.
    """

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Return the entire HTML as one block with index 0.

        Args:
            url (str): The URL of the webpage (not used).
            html (str): The HTML content, returned untouched as the block content.

        Returns:
            List[Dict[str, Any]]: A single-element list holding the whole input.
        """
        whole_page = {"index": 0, "content": html}
        return [whole_page]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Turn every section into its own block, numbered in input order and with an empty tag list.

        Args:
            url (str): The URL of the webpage (not used).
            sections (List[str]): The sections to wrap.

        Returns:
            List[Dict[str, Any]]: One block per section.
        """
        blocks = []
        for position, text in enumerate(sections):
            blocks.append({"index": position, "tags": [], "content": text})
        return blocks
|
||||
|
||||
|
||||
#######################################################
|
||||
# Strategies using LLM-based extraction for text data #
|
||||
#######################################################
|
||||
|
||||
|
||||
|
||||
class LLMExtractionStrategy(ExtractionStrategy):
|
||||
"""
|
||||
A strategy that uses an LLM to extract meaningful content from the HTML.
|
||||
|
||||
Attributes:
|
||||
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
||||
api_token: The API token for the provider.
|
||||
instruction: The instruction to use for the LLM model.
|
||||
schema: Pydantic model schema for structured data.
|
||||
extraction_type: "block" or "schema".
|
||||
chunk_token_threshold: Maximum tokens per chunk.
|
||||
overlap_rate: Overlap between chunks.
|
||||
word_token_rate: Word to token conversion rate.
|
||||
apply_chunking: Whether to apply chunking.
|
||||
base_url: The base URL for the API request.
|
||||
api_base: The base URL for the API request.
|
||||
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
|
||||
verbose: Whether to print verbose output.
|
||||
usages: List of individual token usages.
|
||||
total_usage: Accumulated token usage.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
|
||||
instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
|
||||
"""
|
||||
Initialize the strategy with the LLM provider settings and extraction parameters.
|
||||
|
||||
Args:
|
||||
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
||||
api_token: The API token for the provider.
|
||||
instruction: The instruction to use for the LLM model.
|
||||
schema: Pydantic model schema for structured data.
|
||||
extraction_type: "block" or "schema".
|
||||
chunk_token_threshold: Maximum tokens per chunk.
|
||||
overlap_rate: Overlap between chunks.
|
||||
word_token_rate: Word to token conversion rate.
|
||||
apply_chunking: Whether to apply chunking.
|
||||
base_url: The base URL for the API request.
|
||||
api_base: The base URL for the API request.
|
||||
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
|
||||
verbose: Whether to print verbose output.
|
||||
usages: List of individual token usages.
|
||||
total_usage: Accumulated token usage.
|
||||
|
||||
:param provider: The provider to use for extraction.
|
||||
:param api_token: The API token for the provider.
|
||||
:param instruction: The instruction to use for the LLM model.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.provider = provider
|
||||
@@ -114,6 +151,22 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
|
||||
|
||||
def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract meaningful blocks or chunks from the given HTML using an LLM.
|
||||
|
||||
How it works:
|
||||
1. Construct a prompt with variables.
|
||||
2. Make a request to the LLM using the prompt.
|
||||
3. Parse the response and extract blocks or chunks.
|
||||
|
||||
Args:
|
||||
url: The URL of the webpage.
|
||||
ix: Index of the block.
|
||||
html: The HTML content of the webpage.
|
||||
|
||||
Returns:
|
||||
A list of extracted blocks or chunks.
|
||||
"""
|
||||
if self.verbose:
|
||||
# print("[LOG] Extracting blocks from URL:", url)
|
||||
print(f"[LOG] Call LLM for {url} - block index: {ix}")
|
||||
@@ -180,6 +233,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
return blocks
|
||||
|
||||
def _merge(self, documents, chunk_token_threshold, overlap):
|
||||
"""
|
||||
Merge documents into sections based on chunk_token_threshold and overlap.
|
||||
"""
|
||||
chunks = []
|
||||
sections = []
|
||||
total_tokens = 0
|
||||
@@ -229,6 +285,13 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
|
||||
|
||||
Args:
|
||||
url: The URL of the webpage.
|
||||
sections: List of sections (strings) to process.
|
||||
|
||||
Returns:
|
||||
A list of extracted blocks or chunks.
|
||||
"""
|
||||
|
||||
merged_sections = self._merge(
|
||||
@@ -285,12 +348,30 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
for i, usage in enumerate(self.usages, 1):
|
||||
print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")
|
||||
|
||||
|
||||
#######################################################
|
||||
# Strategies using clustering for text data extraction #
|
||||
#######################################################
|
||||
|
||||
class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Extract meaningful blocks or chunks from the given HTML using cosine similarity.
|
||||
|
||||
How it works:
|
||||
1. Pre-filter documents using embeddings and semantic_filter.
|
||||
2. Perform clustering using cosine similarity.
|
||||
3. Organize texts by their cluster labels, retaining order.
|
||||
4. Filter clusters by word count.
|
||||
5. Extract meaningful blocks or chunks from the filtered clusters.
|
||||
|
||||
Attributes:
|
||||
semantic_filter (str): A keyword filter for document filtering.
|
||||
word_count_threshold (int): Minimum number of words per cluster.
|
||||
max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
|
||||
linkage_method (str): The linkage method for hierarchical clustering.
|
||||
top_k (int): Number of top categories to extract.
|
||||
model_name (str): The name of the sentence-transformers model.
|
||||
sim_threshold (float): The similarity threshold for clustering.
|
||||
"""
|
||||
def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
|
||||
"""
|
||||
Initialize the strategy with clustering parameters.
|
||||
@@ -368,11 +449,13 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
||||
|
||||
:param documents: List of text chunks (documents).
|
||||
:param semantic_filter: A string containing the keywords for filtering.
|
||||
:param threshold: Cosine similarity threshold for filtering documents.
|
||||
:param at_least_k: Minimum number of documents to return.
|
||||
:return: List of filtered documents, ensuring at least `at_least_k` documents.
|
||||
Args:
|
||||
documents (List[str]): A list of document texts.
|
||||
semantic_filter (str): A keyword filter for document filtering.
|
||||
at_least_k (int): The minimum number of documents to return.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of filtered and sorted document texts.
|
||||
"""
|
||||
|
||||
if not semantic_filter:
|
||||
@@ -410,8 +493,11 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Get BERT embeddings for a list of sentences.
|
||||
|
||||
:param sentences: List of text chunks (sentences).
|
||||
:return: NumPy array of embeddings.
|
||||
Args:
|
||||
sentences (List[str]): A list of text chunks (sentences).
|
||||
|
||||
Returns:
|
||||
NumPy array of embeddings.
|
||||
"""
|
||||
# if self.buffer_embeddings.any() and not bypass_buffer:
|
||||
# return self.buffer_embeddings
|
||||
@@ -455,8 +541,11 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Perform hierarchical clustering on sentences and return cluster labels.
|
||||
|
||||
:param sentences: List of text chunks (sentences).
|
||||
:return: NumPy array of cluster labels.
|
||||
Args:
|
||||
sentences (List[str]): A list of text chunks (sentences).
|
||||
|
||||
Returns:
|
||||
NumPy array of cluster labels.
|
||||
"""
|
||||
# Get embeddings
|
||||
from scipy.cluster.hierarchy import linkage, fcluster
|
||||
@@ -472,12 +561,15 @@ class CosineStrategy(ExtractionStrategy):
|
||||
labels = fcluster(linked, self.max_dist, criterion='distance')
|
||||
return labels
|
||||
|
||||
def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]):
|
||||
def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
|
||||
"""
|
||||
Filter clusters to remove those with a word count below the threshold.
|
||||
|
||||
:param clusters: Dictionary of clusters.
|
||||
:return: Filtered dictionary of clusters.
|
||||
Args:
|
||||
clusters (Dict[int, List[str]]): Dictionary of clusters.
|
||||
|
||||
Returns:
|
||||
Dict[int, List[str]]: Filtered dictionary of clusters.
|
||||
"""
|
||||
filtered_clusters = {}
|
||||
for cluster_id, texts in clusters.items():
|
||||
@@ -496,9 +588,12 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Extract clusters from HTML content using hierarchical clustering.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param html: The HTML content of the webpage.
|
||||
:return: A list of dictionaries representing the clusters.
|
||||
Args:
|
||||
url (str): The URL of the webpage.
|
||||
html (str): The HTML content of the webpage.
|
||||
|
||||
Returns:
|
||||
List[Dict[str, Any]]: A list of processed JSON blocks.
|
||||
"""
|
||||
# Assume `html` is a list of text chunks for this strategy
|
||||
t = time.time()
|
||||
@@ -560,159 +655,85 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Process sections using hierarchical clustering.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param sections: List of sections (strings) to process.
|
||||
:param provider: The provider to be used for extraction (not used here).
|
||||
:param api_token: Optional API token for the provider (not used here).
|
||||
:return: A list of processed JSON blocks.
|
||||
Args:
|
||||
url (str): The URL of the webpage.
|
||||
sections (List[str]): List of sections (strings) to process.
|
||||
|
||||
Returns:
List[Dict[str, Any]]: A list of processed JSON blocks.
|
||||
"""
|
||||
# This strategy processes all sections together
|
||||
|
||||
return self.extract(url, self.DEL.join(sections), **kwargs)
|
||||
|
||||
|
||||
#######################################################
|
||||
# Strategies based on the extraction of specific types #
|
||||
#######################################################
|
||||
|
||||
class TopicExtractionStrategy(ExtractionStrategy):
    """
    Split text into segments on the `DEL` delimiter and label each segment with its most frequent tokens.
    """

    def __init__(self, num_keywords: int = 3, **kwargs):
        """
        Initialize the topic extraction strategy.

        Args:
            num_keywords (int): Number of keywords to represent each topic segment.
            **kwargs: Forwarded to the base strategy initializer.
        """
        import nltk
        super().__init__(**kwargs)
        self.num_keywords = num_keywords
        # NOTE(review): the tokenizer is created here but not used by any method of this class.
        self.tokenizer = nltk.TextTilingTokenizer()

    def extract_keywords(self, text: str) -> List[str]:
        """
        Extract keywords from a text segment using simple frequency analysis.

        Args:
            text (str): The text segment from which to extract keywords.

        Returns:
            List[str]: The `num_keywords` most common tokens of the segment.
        """
        import nltk
        tokens = nltk.word_tokenize(text)
        ranked = nltk.FreqDist(tokens).most_common(self.num_keywords)
        keywords = []
        for token, _count in ranked:
            keywords.append(token)
        return keywords

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Build one topic entry per `DEL`-separated segment of the input.

        Args:
            url (str): The URL of the webpage (not used).
            html (str): The text to segment.

        Returns:
            List[Dict[str, Any]]: Dictionaries with the segment's index, content and keywords.
        """
        # Segmentation is a plain split on the delimiter.
        segments = html.split(self.DEL)
        return [
            {
                "index": position,
                "content": segment,
                "keywords": self.extract_keywords(segment),
            }
            for position, segment in enumerate(segments)
        ]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Join the sections with `DEL` and run `extract` on the combined text.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): List of sections (strings) to process.

        Returns:
            List[Dict[str, Any]]: A list of processed JSON blocks.
        """
        joined = self.DEL.join(sections)
        return self.extract(url, joined, **kwargs)
|
||||
|
||||
class ContentSummarizationStrategy(ExtractionStrategy):
    """
    Summarize text sections with a `transformers` summarization pipeline.
    """

    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs):
        """
        Initialize the content summarization strategy with a specific model.

        Args:
            model_name (str): The model to use for summarization.
            **kwargs: Forwarded to the base strategy initializer.
        """
        super().__init__(**kwargs)
        from transformers import pipeline
        self.summarizer = pipeline("summarization", model=model_name)

    def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> Dict[str, Any]:
        """
        Summarize a single section of text.

        Args:
            url (str): The URL of the webpage (not used here).
            text (str): A section of text to summarize.
            provider (str): Not used here.
            api_token (Optional[str]): Not used here.

        Returns:
            Dict[str, Any]: A single dictionary of the form {"summary": ...}. This method returns
            one dict, not a list; `run` collects these dicts into a list.
        """
        try:
            summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
            return {"summary": summary[0]['summary_text']}
        except Exception as e:
            # Deliberate best effort: report the failure and fall back to the original text.
            print(f"Error summarizing text: {e}")
            return {"summary": text}

    def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Summarize every section on a thread pool and return the summaries in input order.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): List of sections (strings) to summarize.
            provider (str): Passed through to `extract` (not used there).
            api_token (Optional[str]): Passed through to `extract` (not used there).

        Returns:
            List[Dict[str, Any]]: One {"summary": ...} dictionary per section, in section order.
        """
        summaries = []
        with ThreadPoolExecutor() as executor:
            # Map each future back to the index of the section it summarizes.
            future_to_section = {
                executor.submit(self.extract, url, section, provider, api_token): i
                for i, section in enumerate(sections)
            }
            for future in as_completed(future_to_section):
                section_index = future_to_section[future]
                try:
                    summary_result = future.result()
                    summaries.append((section_index, summary_result))
                except Exception as e:
                    print(f"Error processing section {section_index}: {e}")
                    # Fallback to the original text of that section.
                    summaries.append((section_index, {"summary": sections[section_index]}))

        # Futures complete in arbitrary order; restore the original section order.
        summaries.sort(key=lambda x: x[0])
        return [summary for _, summary in summaries]
|
||||
|
||||
|
||||
#######################################################
|
||||
# New extraction strategies for JSON-based extraction #
|
||||
#######################################################
|
||||
|
||||
class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Abstract base class for extracting structured JSON from HTML content.
|
||||
|
||||
How it works:
|
||||
1. Parses HTML content using the `_parse_html` method.
|
||||
2. Uses a schema to define base selectors, fields, and transformations.
|
||||
3. Extracts data hierarchically, supporting nested fields and lists.
|
||||
4. Handles computed fields with expressions or functions.
|
||||
|
||||
Attributes:
|
||||
DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
|
||||
schema (Dict[str, Any]): The schema defining the extraction rules.
|
||||
verbose (bool): Enables verbose logging for debugging purposes.
|
||||
|
||||
Methods:
|
||||
extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
|
||||
_extract_item(element, fields): Extracts fields from a single element.
|
||||
_extract_single_field(element, field): Extracts a single field based on its type.
|
||||
_apply_transform(value, transform): Applies a transformation to a value.
|
||||
_compute_field(item, field): Computes a field value using an expression or function.
|
||||
run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.
|
||||
|
||||
Abstract Methods:
|
||||
_parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
|
||||
_get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
|
||||
_get_elements(element, selector): Retrieves child elements using a selector.
|
||||
_get_element_text(element): Extracts text content from an element.
|
||||
_get_element_html(element): Extracts raw HTML from an element.
|
||||
_get_element_attribute(element, attribute): Extracts an attribute's value from an element.
|
||||
"""
|
||||
|
||||
|
||||
DEL = '\n'
|
||||
|
||||
def __init__(self, schema: Dict[str, Any], **kwargs):
    """
    Set up the JSON element extraction strategy.

    Args:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        **kwargs: Forwarded to the base strategy initializer; the optional
            'verbose' entry (default False) is also stored on the instance.
    """
    super().__init__(**kwargs)
    verbose_flag = kwargs.get('verbose', False)
    self.schema = schema
    self.verbose = verbose_flag
|
||||
|
||||
def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract structured data from HTML content.
|
||||
|
||||
How it works:
|
||||
1. Parses the HTML content using the `_parse_html` method.
|
||||
2. Identifies base elements using the schema's base selector.
|
||||
3. Extracts fields from each base element using `_extract_item`.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page being processed.
|
||||
html_content (str): The raw HTML content to parse and extract.
|
||||
*q: Additional positional arguments.
|
||||
**kwargs: Additional keyword arguments for custom extraction.
|
||||
|
||||
Returns:
|
||||
List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
|
||||
"""
|
||||
|
||||
parsed_html = self._parse_html(html_content)
|
||||
base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])
|
||||
|
||||
@@ -772,6 +793,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
return field.get('default')
|
||||
|
||||
def _extract_single_field(self, element, field):
|
||||
"""
|
||||
Extract a single field based on its type.
|
||||
|
||||
How it works:
|
||||
1. Selects the target element using the field's selector.
|
||||
2. Extracts the field value based on its type (e.g., text, attribute, regex).
|
||||
3. Applies transformations if defined in the schema.
|
||||
|
||||
Args:
|
||||
element: The base element to extract the field from.
|
||||
field (Dict[str, Any]): The field definition in the schema.
|
||||
|
||||
Returns:
|
||||
Any: The extracted field value.
|
||||
"""
|
||||
|
||||
if 'selector' in field:
|
||||
selected = self._get_elements(element, field['selector'])
|
||||
if not selected:
|
||||
@@ -806,6 +843,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
return item
|
||||
|
||||
def _extract_item(self, element, fields):
|
||||
"""
|
||||
Extracts fields from a given element.
|
||||
|
||||
How it works:
|
||||
1. Iterates through the fields defined in the schema.
|
||||
2. Handles computed, single, and nested field types.
|
||||
3. Updates the item dictionary with extracted field values.
|
||||
|
||||
Args:
|
||||
element: The base element to extract fields from.
|
||||
fields (List[Dict[str, Any]]): The list of fields to extract.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary representing the extracted item.
|
||||
"""
|
||||
|
||||
item = {}
|
||||
for field in fields:
|
||||
if field['type'] == 'computed':
|
||||
@@ -817,6 +870,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
return item
|
||||
|
||||
def _apply_transform(self, value, transform):
|
||||
"""
|
||||
Apply a transformation to a value.
|
||||
|
||||
How it works:
|
||||
1. Checks the transformation type (e.g., `lowercase`, `strip`).
|
||||
2. Applies the transformation to the value.
|
||||
3. Returns the transformed value.
|
||||
|
||||
Args:
|
||||
value (str): The value to transform.
|
||||
transform (str): The type of transformation to apply.
|
||||
|
||||
Returns:
|
||||
str: The transformed value.
|
||||
"""
|
||||
|
||||
if transform == 'lowercase':
|
||||
return value.lower()
|
||||
elif transform == 'uppercase':
|
||||
@@ -837,6 +906,23 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
return field.get('default')
|
||||
|
||||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Run the extraction strategy over all sections at once.

    The sections are joined with the `DEL` delimiter and the resulting string is
    handed to `extract`. Extra positional arguments are accepted but not forwarded.

    Args:
        url (str): The URL of the page being processed.
        sections (List[str]): A list of HTML sections.
        *q: Additional positional arguments (ignored).
        **kwargs: Additional keyword arguments, forwarded to `extract`.

    Returns:
        List[Dict[str, Any]]: A list of extracted items.
    """
    return self.extract(url, self.DEL.join(sections), **kwargs)
|
||||
|
||||
@@ -856,6 +942,27 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
pass
|
||||
|
||||
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
"""
|
||||
Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
|
||||
|
||||
How it works:
|
||||
1. Parses HTML content with BeautifulSoup.
|
||||
2. Selects elements using CSS selectors defined in the schema.
|
||||
3. Extracts field data and applies transformations as defined.
|
||||
|
||||
Attributes:
|
||||
schema (Dict[str, Any]): The schema defining the extraction rules.
|
||||
verbose (bool): Enables verbose logging for debugging purposes.
|
||||
|
||||
Methods:
|
||||
_parse_html(html_content): Parses HTML content into a BeautifulSoup object.
|
||||
_get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
|
||||
_get_elements(element, selector): Selects child elements using a CSS selector.
|
||||
_get_element_text(element): Extracts text content from a BeautifulSoup element.
|
||||
_get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
|
||||
_get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
|
||||
"""
|
||||
|
||||
def __init__(self, schema: Dict[str, Any], **kwargs):
|
||||
kwargs['input_format'] = 'html' # Force HTML input
|
||||
super().__init__(schema, **kwargs)
|
||||
@@ -880,6 +987,28 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
return element.get(attribute)
|
||||
|
||||
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
||||
"""
|
||||
Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
|
||||
|
||||
How it works:
|
||||
1. Parses HTML content into an lxml tree.
|
||||
2. Selects elements using XPath expressions.
|
||||
3. Converts CSS selectors to XPath when needed.
|
||||
|
||||
Attributes:
|
||||
schema (Dict[str, Any]): The schema defining the extraction rules.
|
||||
verbose (bool): Enables verbose logging for debugging purposes.
|
||||
|
||||
Methods:
|
||||
_parse_html(html_content): Parses HTML content into an lxml tree.
|
||||
_get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
|
||||
_css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
|
||||
_get_elements(element, selector): Selects child elements using an XPath selector.
|
||||
_get_element_text(element): Extracts text content from an lxml element.
|
||||
_get_element_html(element): Extracts the raw HTML content of an lxml element.
|
||||
_get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
|
||||
"""
|
||||
|
||||
def __init__(self, schema: Dict[str, Any], **kwargs):
|
||||
kwargs['input_format'] = 'html' # Force HTML input
|
||||
super().__init__(schema, **kwargs)
|
||||
@@ -921,259 +1050,3 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
||||
def _get_element_attribute(self, element, attribute: str):
|
||||
return element.get(attribute)
|
||||
|
||||
|
||||
class _JsonCssExtractionStrategy(ExtractionStrategy):
    """
    Schema-driven extraction of JSON-like records from HTML using CSS selectors.

    How it works:
        1. Parses the HTML with BeautifulSoup's 'html.parser'.
        2. Selects every element matching ``schema['baseSelector']``.
        3. For each match, reads ``schema['baseFields']`` (when present) from the
           base element, then ``schema['fields']`` from inside it.

    Field definitions are dicts. Keys read by this class: ``name``, ``type``
    ('text', 'attribute', 'html', 'regex', 'nested', 'list', 'nested_list',
    'computed'), ``selector``, ``attribute``, ``pattern``, ``transform``
    ('lowercase', 'uppercase', 'strip'), ``default``, ``fields``,
    ``expression`` and ``function``.

    NOTE(review): ``self.verbose`` and ``self.DEL`` are not set here; they are
    presumably provided by ``ExtractionStrategy`` -- confirm.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """
        Args:
            schema (Dict[str, Any]): Extraction rules (see class docstring).
            **kwargs: Forwarded to the parent constructor; ``input_format`` is
                always overwritten with 'html'.
        """
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract one record per element matched by the schema's base selector.

        Args:
            url (str): Not used by this method.
            html (str): HTML markup to parse.

        Returns:
            List[Dict[str, Any]]: One dict per base element, keyed by field name.
            Fields that resolve to None are left out.
        """
        soup = BeautifulSoup(html, 'html.parser')
        base_elements = soup.select(self.schema['baseSelector'])

        results = []
        for element in base_elements:
            # Extract base element attributes first
            item = {}
            for field in self.schema.get('baseFields', ()):
                value = self._extract_single_field(element, field)
                if value is not None:
                    item[field['name']] = value

            # Then extract child fields; on a name clash the child field wins.
            item.update(self._extract_item(element, self.schema['fields']))
            results.append(item)

        return results

    def _extract_field(self, element, field):
        """
        Extract one field of any non-computed type.

        'nested' yields a dict ({} when the selector matches nothing), 'list'
        and 'nested_list' yield a list of dicts, every other type is delegated
        to ``_extract_single_field``. Any exception is reported when
        ``self.verbose`` is truthy and replaced by the field's ``default``.
        """
        try:
            if field['type'] == 'nested':
                nested_element = element.select_one(field['selector'])
                return self._extract_item(nested_element, field['fields']) if nested_element else {}

            if field['type'] == 'list':
                elements = element.select(field['selector'])
                return [self._extract_list_item(el, field['fields']) for el in elements]

            if field['type'] == 'nested_list':
                elements = element.select(field['selector'])
                return [self._extract_item(el, field['fields']) for el in elements]

            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_list_item(self, element, fields):
        """Build a dict of simple (single-value) fields for one list element, skipping None values."""
        item = {}
        for field in fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _extract_single_field(self, element, field):
        """
        Extract a 'text', 'attribute', 'html' or 'regex' value.

        When the field has a ``selector`` the first match inside ``element`` is
        used (the field's ``default`` is returned if nothing matches);
        otherwise ``element`` itself is used.

        Returns:
            The extracted (and optionally transformed) value, or the field's
            ``default`` when the value is None.
        """
        if 'selector' in field:
            selected = element.select_one(field['selector'])
            if not selected:
                return field.get('default')
        else:
            selected = element

        value = None
        if field['type'] == 'text':
            value = selected.get_text(strip=True)
        elif field['type'] == 'attribute':
            value = selected.get(field['attribute'])
        elif field['type'] == 'html':
            value = str(selected)
        elif field['type'] == 'regex':
            text = selected.get_text(strip=True)
            match = re.search(field['pattern'], text)
            # The pattern must define capture group 1.
            value = match.group(1) if match else None

        if 'transform' in field:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_item(self, element, fields):
        """
        Build a dict for ``element`` from ``fields``.

        'computed' fields are evaluated against the values collected so far,
        so they only see fields listed before them.
        """
        item = {}
        for field in fields:
            if field['type'] == 'computed':
                value = self._compute_field(item, field)
            else:
                value = self._extract_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _apply_transform(self, value, transform):
        """
        Apply a named string transform ('lowercase', 'uppercase', 'strip').

        Unknown transform names leave the value unchanged. None is passed
        through untouched.
        """
        # A missing attribute or a regex without a match yields None. Calling a
        # str method on it raised AttributeError, which aborted the whole
        # extraction on the baseFields / list paths where nothing catches it.
        if value is None:
            return None
        if transform == 'lowercase':
            return value.lower()
        if transform == 'uppercase':
            return value.upper()
        if transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        """
        Evaluate a 'computed' field from the values already extracted.

        ``field['expression']`` is evaluated with ``item`` as its local
        namespace; otherwise ``field['function']`` is called with ``item``.
        Exceptions are reported when ``self.verbose`` is truthy and replaced by
        the field's ``default``. Returns None when the field has neither key.
        """
        try:
            if 'expression' in field:
                # SECURITY: eval() runs arbitrary Python. Empty globals do not
                # sandbox it -- only use schemas from trusted sources.
                return eval(field['expression'], {}, item)
            elif 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
            return field.get('default')
        return None

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Join ``sections`` with ``self.DEL`` and run ``extract`` on the combined HTML."""
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)
|
||||
class _JsonXPathExtractionStrategy(ExtractionStrategy):
    """
    Schema-driven extraction of JSON-like records from HTML using XPath.

    How it works:
        1. Parses the markup with ``html.fromstring`` (``html`` is a
           module-level name here, presumably ``lxml.html`` -- confirm).
        2. Selects base elements with ``schema['baseSelector']``, used as an
           XPath expression exactly as given.
        3. For each match, reads ``schema['baseFields']`` (when present) from
           the base element, then ``schema['fields']`` from inside it. Field
           selectors without a '/' go through a very small CSS-to-XPath
           conversion.

    Field definitions are dicts. Keys read by this class: ``name``, ``type``
    ('text', 'attribute', 'html', 'regex', 'nested', 'list', 'nested_list',
    'computed'), ``selector``, ``attribute``, ``pattern``, ``transform``
    ('lowercase', 'uppercase', 'strip'), ``default``, ``fields``,
    ``expression`` and ``function``.

    NOTE(review): ``self.verbose`` and ``self.DEL`` are not set here; they are
    presumably provided by ``ExtractionStrategy`` -- confirm.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """
        Args:
            schema (Dict[str, Any]): Extraction rules (see class docstring).
            **kwargs: Forwarded to the parent constructor; ``input_format`` is
                always overwritten with 'html'.
        """
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract one record per element matched by the schema's base XPath.

        Args:
            url (str): Not used by this method.
            html_content (str): HTML markup to parse. (Named ``html_content``
                so it does not hide the module-level ``html`` name.)

        Returns:
            List[Dict[str, Any]]: One dict per base element, keyed by field name.
            Fields that resolve to None are left out.
        """
        tree = html.fromstring(html_content)
        base_elements = tree.xpath(self.schema['baseSelector'])

        results = []
        for element in base_elements:
            # Extract base element attributes first
            item = {}
            for field in self.schema.get('baseFields', ()):
                value = self._extract_single_field(element, field)
                if value is not None:
                    item[field['name']] = value

            # Then extract child fields; on a name clash the child field wins.
            item.update(self._extract_item(element, self.schema['fields']))
            results.append(item)

        return results

    def _css_to_xpath(self, css_selector: str) -> str:
        """Return the selector unchanged if it contains '/', otherwise convert it with ``_basic_css_to_xpath``."""
        if '/' in css_selector:  # Already an XPath
            return css_selector
        # Fallback to basic conversion for common cases
        return self._basic_css_to_xpath(css_selector)

    def _basic_css_to_xpath(self, css_selector: str) -> str:
        """
        Minimal CSS-to-XPath conversion.

        Handles only the child combinator (' > ' becomes '/'), the descendant
        combinator (' ' becomes '//') and a bare selector; the pieces are
        copied verbatim, so class, id and attribute syntax is not translated.

        NOTE(review): the result always starts with '//', which XPath resolves
        from the document root even when evaluated on a context element. Field
        selectors converted here therefore search the whole document rather
        than the current base element ('.//' would be relative) -- confirm
        whether that is intended before changing it.
        """
        if ' > ' in css_selector:
            parts = css_selector.split(' > ')
            return '//' + '/'.join(parts)
        if ' ' in css_selector:
            parts = css_selector.split(' ')
            return '//' + '//'.join(parts)
        return '//' + css_selector

    def _extract_field(self, element, field):
        """
        Extract one field of any non-computed type.

        'nested' yields a dict ({} when the selector matches nothing), 'list'
        and 'nested_list' yield a list of dicts, every other type is delegated
        to ``_extract_single_field``. Any exception is reported when
        ``self.verbose`` is truthy and replaced by the field's ``default``.
        """
        try:
            if field['type'] == 'nested':
                xpath = self._css_to_xpath(field['selector'])
                # Evaluate the expression once and reuse the result.
                matches = element.xpath(xpath)
                nested_element = matches[0] if matches else None
                # Compare with None explicitly rather than relying on the
                # element's truthiness.
                return self._extract_item(nested_element, field['fields']) if nested_element is not None else {}

            if field['type'] == 'list':
                xpath = self._css_to_xpath(field['selector'])
                elements = element.xpath(xpath)
                return [self._extract_list_item(el, field['fields']) for el in elements]

            if field['type'] == 'nested_list':
                xpath = self._css_to_xpath(field['selector'])
                elements = element.xpath(xpath)
                return [self._extract_item(el, field['fields']) for el in elements]

            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_list_item(self, element, fields):
        """Build a dict of simple (single-value) fields for one list element, skipping None values."""
        item = {}
        for field in fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _extract_single_field(self, element, field):
        """
        Extract a 'text', 'attribute', 'html' or 'regex' value.

        When the field has a ``selector`` the first XPath match is used (the
        field's ``default`` is returned if nothing matches); otherwise
        ``element`` itself is used.

        Returns:
            The extracted (and optionally transformed) value, or the field's
            ``default`` when the value is None.
        """
        if 'selector' in field:
            xpath = self._css_to_xpath(field['selector'])
            selected = element.xpath(xpath)
            if not selected:
                return field.get('default')
            selected = selected[0]
        else:
            selected = element

        value = None
        if field['type'] == 'text':
            value = ''.join(selected.xpath('.//text()')).strip()
        elif field['type'] == 'attribute':
            value = selected.get(field['attribute'])
        elif field['type'] == 'html':
            value = etree.tostring(selected, encoding='unicode')
        elif field['type'] == 'regex':
            text = ''.join(selected.xpath('.//text()')).strip()
            match = re.search(field['pattern'], text)
            # The pattern must define capture group 1.
            value = match.group(1) if match else None

        if 'transform' in field:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_item(self, element, fields):
        """
        Build a dict for ``element`` from ``fields``.

        'computed' fields are evaluated against the values collected so far,
        so they only see fields listed before them.
        """
        item = {}
        for field in fields:
            if field['type'] == 'computed':
                value = self._compute_field(item, field)
            else:
                value = self._extract_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _apply_transform(self, value, transform):
        """
        Apply a named string transform ('lowercase', 'uppercase', 'strip').

        Unknown transform names leave the value unchanged. None is passed
        through untouched.
        """
        # A missing attribute or a regex without a match yields None. Calling a
        # str method on it raised AttributeError, which aborted the whole
        # extraction on the baseFields / list paths where nothing catches it.
        if value is None:
            return None
        if transform == 'lowercase':
            return value.lower()
        if transform == 'uppercase':
            return value.upper()
        if transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        """
        Evaluate a 'computed' field from the values already extracted.

        ``field['expression']`` is evaluated with ``item`` as its local
        namespace; otherwise ``field['function']`` is called with ``item``.
        Exceptions are reported when ``self.verbose`` is truthy and replaced by
        the field's ``default``. Returns None when the field has neither key.
        """
        try:
            if 'expression' in field:
                # SECURITY: eval() runs arbitrary Python. Empty globals do not
                # sandbox it -- only use schemas from trusted sources.
                return eval(field['expression'], {}, item)
            elif 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
            return field.get('default')
        return None

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Join ``sections`` with ``self.DEL`` and run ``extract`` on the combined HTML."""
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)
|
||||
|
||||
@@ -38,11 +38,44 @@ class MarkdownGenerationStrategy(ABC):
|
||||
pass
|
||||
|
||||
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
"""Default implementation of markdown generation strategy."""
|
||||
"""
|
||||
Default implementation of markdown generation strategy.
|
||||
|
||||
How it works:
|
||||
1. Generate raw markdown from cleaned HTML.
|
||||
2. Convert links to citations.
|
||||
3. Generate fit markdown if content filter is provided.
|
||||
4. Return MarkdownGenerationResult.
|
||||
|
||||
Args:
|
||||
content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
|
||||
options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
|
||||
|
||||
Returns:
|
||||
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
|
||||
"""
|
||||
def __init__(
    self,
    content_filter: Optional[RelevantContentFilter] = None,
    options: Optional[Dict[str, Any]] = None,
):
    """
    Hand both arguments, unchanged and in order, to the parent constructor.

    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter. Defaults to None.
        options (Optional[Dict[str, Any]]): Additional options. Defaults to None.
    """
    super().__init__(content_filter, options)
|
||||
|
||||
def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
|
||||
"""
|
||||
Convert links in markdown to citations.
|
||||
|
||||
How it works:
|
||||
1. Find all links in the markdown.
|
||||
2. Convert links to citations.
|
||||
3. Return converted markdown and references markdown.
|
||||
|
||||
Note:
|
||||
This function uses a regex pattern to find links in markdown.
|
||||
|
||||
Args:
|
||||
markdown (str): Markdown text.
|
||||
base_url (str): Base URL for URL joins.
|
||||
|
||||
Returns:
|
||||
Tuple[str, str]: Converted markdown and references markdown.
|
||||
"""
|
||||
link_map = {}
|
||||
url_cache = {} # Cache for URL joins
|
||||
parts = []
|
||||
@@ -90,7 +123,26 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
citations: bool = True,
|
||||
**kwargs) -> MarkdownGenerationResult:
|
||||
"""Generate markdown with citations from cleaned HTML."""
|
||||
"""
|
||||
Generate markdown with citations from cleaned HTML.
|
||||
|
||||
How it works:
|
||||
1. Generate raw markdown from cleaned HTML.
|
||||
2. Convert links to citations.
|
||||
3. Generate fit markdown if content filter is provided.
|
||||
4. Return MarkdownGenerationResult.
|
||||
|
||||
Args:
|
||||
cleaned_html (str): Cleaned HTML content.
|
||||
base_url (str): Base URL for URL joins.
|
||||
html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
|
||||
options (Optional[Dict[str, Any]]): Additional options for markdown generation.
|
||||
content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
|
||||
citations (bool): Whether to generate citations.
|
||||
|
||||
Returns:
|
||||
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
|
||||
"""
|
||||
# Initialize HTML2Text with options
|
||||
h = CustomHTML2Text()
|
||||
if html2text_options:
|
||||
|
||||
@@ -13,13 +13,34 @@ from pathlib import Path
|
||||
class SSLCertificate:
|
||||
"""
|
||||
A class representing an SSL certificate with methods to export in various formats.
|
||||
|
||||
Attributes:
|
||||
cert_info (Dict[str, Any]): The certificate information.
|
||||
|
||||
Methods:
|
||||
from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
|
||||
from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file.
|
||||
from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data.
|
||||
export_as_pem() -> str: Export the certificate as PEM format.
|
||||
export_as_der() -> bytes: Export the certificate as DER format.
|
||||
export_as_json() -> Dict[str, Any]: Export the certificate as JSON format.
|
||||
export_as_text() -> str: Export the certificate as text format.
|
||||
"""
|
||||
def __init__(self, cert_info: Dict[str, Any]):
    """
    Store the certificate information after passing it through ``_decode_cert_data``.

    Args:
        cert_info (Dict[str, Any]): The certificate information.
    """
    decoded_info = self._decode_cert_data(cert_info)
    self._cert_info = decoded_info
|
||||
|
||||
@staticmethod
|
||||
def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']:
|
||||
"""Create SSLCertificate instance from a URL."""
|
||||
"""
|
||||
Create SSLCertificate instance from a URL.
|
||||
|
||||
Args:
|
||||
url (str): URL of the website.
|
||||
timeout (int): Timeout for the connection (default: 10).
|
||||
|
||||
Returns:
|
||||
Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
|
||||
"""
|
||||
try:
|
||||
hostname = urlparse(url).netloc
|
||||
if ':' in hostname:
|
||||
@@ -73,7 +94,15 @@ class SSLCertificate:
|
||||
return data
|
||||
|
||||
def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
|
||||
"""Export certificate as JSON."""
|
||||
"""
|
||||
Export certificate as JSON.
|
||||
|
||||
Args:
|
||||
filepath (Optional[str]): Path to save the JSON file (default: None).
|
||||
|
||||
Returns:
|
||||
Optional[str]: JSON string if successful, None otherwise.
|
||||
"""
|
||||
json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
|
||||
if filepath:
|
||||
Path(filepath).write_text(json_str, encoding='utf-8')
|
||||
@@ -81,7 +110,15 @@ class SSLCertificate:
|
||||
return json_str
|
||||
|
||||
def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
|
||||
"""Export certificate as PEM."""
|
||||
"""
|
||||
Export certificate as PEM.
|
||||
|
||||
Args:
|
||||
filepath (Optional[str]): Path to save the PEM file (default: None).
|
||||
|
||||
Returns:
|
||||
Optional[str]: PEM string if successful, None otherwise.
|
||||
"""
|
||||
try:
|
||||
x509 = OpenSSL.crypto.load_certificate(
|
||||
OpenSSL.crypto.FILETYPE_ASN1,
|
||||
@@ -100,7 +137,15 @@ class SSLCertificate:
|
||||
return None
|
||||
|
||||
def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
|
||||
"""Export certificate as DER."""
|
||||
"""
|
||||
Export certificate as DER.
|
||||
|
||||
Args:
|
||||
filepath (Optional[str]): Path to save the DER file (default: None).
|
||||
|
||||
Returns:
|
||||
Optional[bytes]: DER bytes if successful, None otherwise.
|
||||
"""
|
||||
try:
|
||||
der_data = base64.b64decode(self._cert_info['raw_cert'])
|
||||
if filepath:
|
||||
|
||||
@@ -4,6 +4,34 @@ import re
|
||||
|
||||
|
||||
class UserAgentGenerator:
|
||||
"""
|
||||
Generate random user agents with specified constraints.
|
||||
|
||||
Attributes:
|
||||
desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings.
|
||||
mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings.
|
||||
browser_combinations (dict): A dictionary of possible browser combinations and their corresponding user agent strings.
|
||||
rendering_engines (dict): A dictionary of possible rendering engines and their corresponding user agent strings.
|
||||
chrome_versions (list): A list of possible Chrome browser versions.
|
||||
firefox_versions (list): A list of possible Firefox browser versions.
|
||||
edge_versions (list): A list of possible Edge browser versions.
|
||||
safari_versions (list): A list of possible Safari browser versions.
|
||||
ios_versions (list): A list of possible iOS browser versions.
|
||||
android_versions (list): A list of possible Android browser versions.
|
||||
|
||||
Methods:
|
||||
generate_user_agent(
|
||||
platform: Literal["desktop", "mobile"] = "desktop",
|
||||
browser: str = "chrome",
|
||||
rendering_engine: str = "chrome_webkit",
|
||||
chrome_version: Optional[str] = None,
|
||||
firefox_version: Optional[str] = None,
|
||||
edge_version: Optional[str] = None,
|
||||
safari_version: Optional[str] = None,
|
||||
ios_version: Optional[str] = None,
|
||||
android_version: Optional[str] = None
|
||||
): Generates a random user agent string based on the specified parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
# Previous platform definitions remain the same...
|
||||
self.desktop_platforms = {
|
||||
@@ -105,7 +133,21 @@ class UserAgentGenerator:
|
||||
]
|
||||
|
||||
def get_browser_stack(self, num_browsers: int = 1) -> List[str]:
|
||||
"""Get a valid combination of browser versions"""
|
||||
"""
|
||||
Get a valid combination of browser versions.
|
||||
|
||||
How it works:
|
||||
1. Check if the number of browsers is supported.
|
||||
2. Randomly choose a combination of browsers.
|
||||
3. Iterate through the combination and add browser versions.
|
||||
4. Return the browser stack.
|
||||
|
||||
Args:
|
||||
num_browsers: Number of browser specifications (1-3)
|
||||
|
||||
Returns:
|
||||
List[str]: A list of browser versions.
|
||||
"""
|
||||
if num_browsers not in self.browser_combinations:
|
||||
raise ValueError(f"Unsupported number of browsers: {num_browsers}")
|
||||
|
||||
|
||||
@@ -25,64 +25,91 @@ from functools import wraps
|
||||
class InvalidCSSSelectorError(Exception):
|
||||
pass
|
||||
|
||||
def create_box_message(
    message: str,
    type: str = "info",
    width: int = 120,
    add_newlines: bool = True,
    double_line: bool = False
) -> str:
    """
    Render ``message`` inside a coloured box drawn with box-drawing characters.

    The first line of the message gets the type's prefix glyph; every later
    non-blank line is stripped and indented by one space; blank lines are
    kept as empty rows. Text is wrapped at ``width - 4`` columns.

    NOTE(review): a second ``create_box_message`` is defined further down in
    this file and would replace this one at import time -- confirm which
    definition is meant to stay.

    Args:
        message (str): Text to display; may contain newlines.
        type (str): "info", "warning", "success" or "error" (case-insensitive).
            Unknown values use the "info" style.
        width (int): Box width used for wrapping and padding.
        add_newlines (bool): Surround the box with one newline on each side.
        double_line (bool): Use double-line border characters.

    Returns:
        str: The box as a single string.
    """
    init()

    # (border colour, text colour, prefix glyph) for each message type
    palette = {
        "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
        "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"),
        "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
        "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
    }
    kind = type.lower()
    if kind not in palette:
        kind = "info"
    border_color, text_color, prefix = palette[kind]

    # horizontal, vertical, top-left, top-right, bottom-left, bottom-right
    if double_line:
        h_line, v_line, tl, tr, bl, br = "═", "║", "╔", "╗", "╚", "╝"
    else:
        h_line, v_line, tl, tr, bl, br = "─", "│", "┌", "┐", "└", "┘"

    # Wrap the message: prefixed first line, then the remaining lines
    head, *rest = message.split('\n')
    rows = textwrap.fill(f"{prefix} {head.strip()}", width=width-4).split('\n')
    for raw in rest:
        stripped = raw.strip()
        if stripped:
            rows.extend(textwrap.fill(f" {stripped}", width=width-4).split('\n'))
        else:
            rows.append("")

    # Assemble top border, padded content rows and bottom border
    rule = h_line * (width - 1)
    pieces = [f"{border_color}{tl}{rule}{tr}"]
    for row in rows:
        pieces.append(f"{border_color}{v_line}{text_color} {row:<{width-2}}{border_color}{v_line}")
    pieces.append(f"{border_color}{bl}{rule}{br}{Style.RESET_ALL}")

    boxed = "\n".join(pieces)
    return f"\n{boxed}\n" if add_newlines else boxed
|
||||
def create_box_message(message: str, type: str = "info", width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str:
    """
    Create a styled message box with colored borders and formatted text.

    How it works:
        1. Picks border color, text color and prefix glyph from the message type.
        2. Wraps each line of the message to ``width - 4`` columns.
        3. Frames the wrapped lines with single- or double-line box characters.
        4. Optionally adds a newline before and after the box.

    Args:
        message (str): The message to display inside the box.
        type (str): Type of the message ("info", "warning", "error", "success");
            matched case-insensitively, unknown types fall back to "info".
        width (int): Width of the box. Defaults to 120.
        add_newlines (bool): Whether to add newlines before and after the box. Defaults to True.
        double_line (bool): Whether to use double lines for the box border. Defaults to False.

    Returns:
        str: A formatted string containing the styled message box.
    """
    init()

    # Border color, text color and prefix glyph per message type
    style_table = {
        "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
        "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"),
        "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
        "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
    }
    border_color, text_color, prefix = style_table.get(type.lower(), style_table["info"])

    # Box glyphs keyed by whether the double-line style was requested
    glyphs = {
        False: ("─", "│", "┌", "┐", "└", "┘"),
        True: ("═", "║", "╔", "╗", "╚", "╝"),
    }
    h_line, v_line, tl, tr, bl, br = glyphs[bool(double_line)]

    def _wrap(text):
        # wrap() returns [] for empty text; keep a single empty row in that
        # case, which is what fill(...).split('\n') produces.
        return textwrap.wrap(text, width=width-4) or [""]

    raw_lines = message.split('\n')
    formatted_lines = _wrap(f"{prefix} {raw_lines[0].strip()}")
    for line in raw_lines[1:]:
        content = line.strip()
        formatted_lines += _wrap(f" {content}") if content else [""]

    # Borders first, then the padded content rows in between
    horizontal_line = h_line * (width - 1)
    top_border = f"{border_color}{tl}{horizontal_line}{tr}"
    bottom_border = f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}"
    content_rows = [
        f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}"
        for line in formatted_lines
    ]

    result = "\n".join([top_border, *content_rows, bottom_border])
    if not add_newlines:
        return result
    return f"\n{result}\n"
|
||||
|
||||
def calculate_semaphore_count():
|
||||
"""
|
||||
Calculate the optimal semaphore count based on system resources.
|
||||
|
||||
How it works:
|
||||
1. Determines the number of CPU cores and total system memory.
|
||||
2. Sets a base count as half of the available CPU cores.
|
||||
3. Limits the count based on memory, assuming 2GB per semaphore instance.
|
||||
4. Returns the minimum value between CPU and memory-based limits.
|
||||
|
||||
Returns:
|
||||
int: The calculated semaphore count.
|
||||
"""
|
||||
|
||||
cpu_count = os.cpu_count()
|
||||
memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB
|
||||
base_count = max(1, cpu_count // 2)
|
||||
@@ -90,6 +117,21 @@ def calculate_semaphore_count():
|
||||
return min(base_count, memory_based_cap)
|
||||
|
||||
def get_system_memory():
|
||||
"""
|
||||
Get the total system memory in bytes.
|
||||
|
||||
How it works:
|
||||
1. Detects the operating system.
|
||||
2. Reads memory information from system-specific commands or files.
|
||||
3. Converts the memory to bytes for uniformity.
|
||||
|
||||
Returns:
|
||||
int: The total system memory in bytes.
|
||||
|
||||
Raises:
|
||||
OSError: If the operating system is unsupported.
|
||||
"""
|
||||
|
||||
system = platform.system()
|
||||
if system == "Linux":
|
||||
with open('/proc/meminfo', 'r') as mem:
|
||||
@@ -124,6 +166,18 @@ def get_system_memory():
|
||||
raise OSError("Unsupported operating system")
|
||||
|
||||
def get_home_folder():
|
||||
"""
|
||||
Get or create the home folder for Crawl4AI configuration and cache.
|
||||
|
||||
How it works:
|
||||
1. Uses environment variables or defaults to the user's home directory.
|
||||
2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist.
|
||||
3. Returns the path to the home folder.
|
||||
|
||||
Returns:
|
||||
str: The path to the Crawl4AI home folder.
|
||||
"""
|
||||
|
||||
home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/cache", exist_ok=True)
|
||||
@@ -194,6 +248,20 @@ def split_and_parse_json_objects(json_string):
|
||||
return parsed_objects, unparsed_segments
|
||||
|
||||
def sanitize_html(html):
|
||||
"""
|
||||
Sanitize an HTML string by escaping quotes.
|
||||
|
||||
How it works:
|
||||
1. Replaces all unwanted and special characters with an empty string.
|
||||
2. Escapes double and single quotes for safe usage.
|
||||
|
||||
Args:
|
||||
html (str): The HTML string to sanitize.
|
||||
|
||||
Returns:
|
||||
str: The sanitized HTML string.
|
||||
"""
|
||||
|
||||
# Replace all unwanted and special characters with an empty string
|
||||
sanitized_html = html
|
||||
# sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
|
||||
@@ -248,6 +316,23 @@ def escape_json_string(s):
|
||||
return s
|
||||
|
||||
def replace_inline_tags(soup, tags, only_text=False):
|
||||
"""
|
||||
Replace inline HTML tags with Markdown-style equivalents.
|
||||
|
||||
How it works:
|
||||
1. Maps specific tags (e.g., <b>, <i>) to Markdown syntax.
|
||||
2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object.
|
||||
3. Optionally replaces tags with their text content only.
|
||||
|
||||
Args:
|
||||
soup (BeautifulSoup): Parsed HTML content.
|
||||
tags (List[str]): List of tags to replace.
|
||||
only_text (bool): Whether to replace tags with plain text. Defaults to False.
|
||||
|
||||
Returns:
|
||||
BeautifulSoup: Updated BeautifulSoup object with replaced tags.
|
||||
"""
|
||||
|
||||
tag_replacements = {
|
||||
'b': lambda tag: f"**{tag.text}**",
|
||||
'i': lambda tag: f"*{tag.text}*",
|
||||
@@ -292,6 +377,26 @@ def replace_inline_tags(soup, tags, only_text=False):
|
||||
# return soup
|
||||
|
||||
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs):
|
||||
"""
|
||||
Extract structured content, media, and links from website HTML.
|
||||
|
||||
How it works:
|
||||
1. Parses the HTML content using BeautifulSoup.
|
||||
2. Extracts internal/external links and media (images, videos, audios).
|
||||
3. Cleans the content by removing unwanted tags and attributes.
|
||||
4. Converts cleaned HTML to Markdown.
|
||||
5. Collects metadata and returns the extracted information.
|
||||
|
||||
Args:
|
||||
url (str): The website URL.
|
||||
html (str): The HTML content of the website.
|
||||
word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
|
||||
css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
|
||||
"""
|
||||
|
||||
try:
|
||||
if not html:
|
||||
return None
|
||||
@@ -762,6 +867,27 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
}
|
||||
|
||||
def extract_metadata(html, soup=None):
|
||||
"""
|
||||
Extract metadata from website HTML.

Args:
html (str): The HTML content of the website.
soup (optional): An already-parsed document (presumably a BeautifulSoup object) that may be supplied alongside or instead of `html`. Defaults to None.

Returns:
Dict[str, Any]: The metadata collected from the page.
|
||||
"""
|
||||
|
||||
metadata = {}
|
||||
|
||||
if not html and not soup:
|
||||
@@ -809,10 +935,35 @@ def extract_metadata(html, soup=None):
|
||||
return metadata
|
||||
|
||||
def extract_xml_tags(string):
    """
    Extract the distinct tag names that appear as bare opening tags in a string.

    Only tags of the exact form ``<name>`` match; closing tags (``</name>``)
    and tags carrying attributes or a trailing slash do not.

    Args:
        string (str): The input string containing XML tags.

    Returns:
        List[str]: Each distinct tag name once, in order of first appearance.
    """
    tags = re.findall(r'<(\w+)>', string)
    # dict.fromkeys() removes duplicates while keeping first-seen order;
    # list(set(...)) returned the names in an order that changes between runs.
    return list(dict.fromkeys(tags))
|
||||
|
||||
def extract_xml_data(tags, string):
|
||||
"""
|
||||
Extract data for specified XML tags from a string.
|
||||
|
||||
How it works:
|
||||
1. Searches the string for each tag using regex.
|
||||
2. Extracts the content within the tags.
|
||||
3. Returns a dictionary of tag-content pairs.
|
||||
|
||||
Args:
|
||||
tags (List[str]): The list of XML tags to extract.
|
||||
string (str): The input string containing XML data.
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: A dictionary with tag names as keys and extracted content as values.
|
||||
"""
|
||||
|
||||
data = {}
|
||||
|
||||
for tag in tags:
|
||||
@@ -833,6 +984,26 @@ def perform_completion_with_backoff(
|
||||
base_url=None,
|
||||
**kwargs
|
||||
):
|
||||
"""
|
||||
Perform an API completion request with exponential backoff.
|
||||
|
||||
How it works:
|
||||
1. Sends a completion request to the API.
|
||||
2. Retries on rate-limit errors with exponential delays.
|
||||
3. Returns the API response or an error after all retries.
|
||||
|
||||
Args:
|
||||
provider (str): The name of the API provider.
|
||||
prompt_with_variables (str): The input prompt for the completion request.
|
||||
api_token (str): The API token for authentication.
|
||||
json_response (bool): Whether to request a JSON response. Defaults to False.
|
||||
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
||||
**kwargs: Additional arguments for the API request.
|
||||
|
||||
Returns:
|
||||
dict: The API response or an error message after all retries.
|
||||
"""
|
||||
|
||||
from litellm import completion
|
||||
from litellm.exceptions import RateLimitError
|
||||
max_attempts = 3
|
||||
@@ -878,6 +1049,25 @@ def perform_completion_with_backoff(
|
||||
}]
|
||||
|
||||
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
|
||||
"""
|
||||
Extract content blocks from website HTML using an AI provider.
|
||||
|
||||
How it works:
|
||||
1. Prepares a prompt by sanitizing and escaping HTML.
|
||||
2. Sends the prompt to an AI provider with optional retries.
|
||||
3. Parses the response to extract structured blocks or errors.
|
||||
|
||||
Args:
|
||||
url (str): The website URL.
|
||||
html (str): The HTML content of the website.
|
||||
provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER.
|
||||
api_token (Optional[str]): The API token for authentication. Defaults to None.
|
||||
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[dict]: A list of extracted content blocks.
|
||||
"""
|
||||
|
||||
# api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
||||
api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
|
||||
|
||||
@@ -914,6 +1104,23 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, bas
|
||||
return blocks
|
||||
|
||||
def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
|
||||
"""
|
||||
Extract content blocks from a batch of website HTMLs.
|
||||
|
||||
How it works:
|
||||
1. Prepares prompts for each URL and HTML pair.
|
||||
2. Sends the prompts to the AI provider in a batch request.
|
||||
3. Parses the responses to extract structured blocks or errors.
|
||||
|
||||
Args:
|
||||
batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs.
|
||||
provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192".
|
||||
api_token (Optional[str]): The API token for authentication. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[dict]: A list of extracted content blocks from all batch items.
|
||||
"""
|
||||
|
||||
api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
||||
from litellm import batch_completion
|
||||
messages = []
|
||||
@@ -986,6 +1193,25 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
||||
return merged_sections
|
||||
|
||||
def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
|
||||
"""
|
||||
Process sections of HTML content sequentially or in parallel.
|
||||
|
||||
How it works:
|
||||
1. Sequentially processes sections with delays for "groq/" providers.
|
||||
2. Uses ThreadPoolExecutor for parallel processing with other providers.
|
||||
3. Extracts content blocks for each section.
|
||||
|
||||
Args:
|
||||
url (str): The website URL.
|
||||
sections (List[str]): The list of HTML sections to process.
|
||||
provider (str): The AI provider for content extraction.
|
||||
api_token (str): The API token for authentication.
|
||||
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[dict]: The list of extracted content blocks from all sections.
|
||||
"""
|
||||
|
||||
extracted_content = []
|
||||
if provider.startswith("groq/"):
|
||||
# Sequential processing with a delay
|
||||
@@ -1002,6 +1228,24 @@ def process_sections(url: str, sections: list, provider: str, api_token: str, ba
|
||||
return extracted_content
|
||||
|
||||
def wrap_text(draw, text, font, max_width):
|
||||
"""
|
||||
Wrap text to fit within a specified width for rendering.
|
||||
|
||||
How it works:
|
||||
1. Splits the text into words.
|
||||
2. Constructs lines that fit within the maximum width using the provided font.
|
||||
3. Returns the wrapped text as a single string.
|
||||
|
||||
Args:
|
||||
draw (ImageDraw.Draw): The drawing context for measuring text size.
|
||||
text (str): The text to wrap.
|
||||
font (ImageFont.FreeTypeFont): The font to use for measuring text size.
|
||||
max_width (int): The maximum width for each line.
|
||||
|
||||
Returns:
|
||||
str: The wrapped text.
|
||||
"""
|
||||
|
||||
# Wrap the text to fit within the specified width
|
||||
lines = []
|
||||
words = text.split()
|
||||
@@ -1013,6 +1257,21 @@ def wrap_text(draw, text, font, max_width):
|
||||
return '\n'.join(lines)
|
||||
|
||||
def format_html(html_string):
    """
    Prettify an HTML string using BeautifulSoup.

    How it works:
    1. Parses the HTML string with BeautifulSoup's built-in "html.parser" backend.
    2. Formats the parsed tree with BeautifulSoup's `prettify()`.
    3. Returns the prettified HTML string.

    Args:
        html_string (str): The HTML string to format.

    Returns:
        str: The prettified HTML string.
    """
    # The previous feature name, "lxml.parser", is not a parser BeautifulSoup
    # knows about (valid names are "html.parser", "lxml", "lxml-xml" and
    # "html5lib"), so the constructor raised bs4.FeatureNotFound on every call.
    # "html.parser" ships with Python and needs no extra dependency.
    soup = BeautifulSoup(html_string, "html.parser")
    return soup.prettify()
|
||||
|
||||
@@ -1110,7 +1369,20 @@ def normalize_url_tmp(href, base_url):
|
||||
return href.strip()
|
||||
|
||||
def get_base_domain(url: str) -> str:
|
||||
"""Extract base domain from URL, handling various edge cases."""
|
||||
"""
|
||||
Extract the base domain from a given URL, handling common edge cases.
|
||||
|
||||
How it works:
|
||||
1. Parses the URL to extract the domain.
|
||||
2. Removes the port number and 'www' prefix.
|
||||
3. Handles special domains (e.g., 'co.uk') to extract the correct base.
|
||||
|
||||
Args:
|
||||
url (str): The URL to extract the base domain from.
|
||||
|
||||
Returns:
|
||||
str: The extracted base domain or an empty string if parsing fails.
|
||||
"""
|
||||
try:
|
||||
# Get domain from URL
|
||||
domain = urlparse(url).netloc.lower()
|
||||
@@ -1136,7 +1408,20 @@ def get_base_domain(url: str) -> str:
|
||||
return ""
|
||||
|
||||
def is_external_url(url: str, base_domain: str) -> bool:
|
||||
"""Check if URL is external to base domain."""
|
||||
"""
|
||||
Check whether a URL is external to the given base domain.
|
||||
|
||||
How it works:
|
||||
1. Treats URLs that start with a special scheme (e.g., 'mailto:', 'tel:', 'javascript:') as external.
|
||||
2. Otherwise checks the URL against the base domain.
|
||||
3. Returns False if an exception is raised during the check.
|
||||
|
||||
Args:
|
||||
url (str): The URL to check.
|
||||
|
||||
base_domain (str): The base domain to compare against.
|
||||
|
||||
Returns:
|
||||
|
||||
bool: True if the URL is external to the base domain, False otherwise.
|
||||
"""
|
||||
special = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
|
||||
if any(url.lower().startswith(p) for p in special):
|
||||
return True
|
||||
@@ -1155,8 +1440,22 @@ def is_external_url(url: str, base_domain: str) -> bool:
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def clean_tokens(tokens: list[str]) -> list[str]:
|
||||
"""
|
||||
Clean a list of tokens by removing noise, stop words, and short tokens.
|
||||
|
||||
How it works:
|
||||
1. Defines a set of noise words and stop words.
|
||||
2. Filters tokens based on length and exclusion criteria.
|
||||
3. Excludes tokens starting with certain symbols (e.g., "↑", "▲").
|
||||
|
||||
Args:
|
||||
tokens (list[str]): The list of tokens to clean.
|
||||
|
||||
Returns:
|
||||
list[str]: The cleaned list of tokens.
|
||||
"""
|
||||
|
||||
# Set of tokens to remove
|
||||
noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'}
|
||||
|
||||
@@ -1212,6 +1511,21 @@ def clean_tokens(tokens: list[str]) -> list[str]:
|
||||
and not token.startswith('⬆')]
|
||||
|
||||
def profile_and_time(func):
|
||||
"""
|
||||
Decorator to profile a function's execution time and performance.
|
||||
|
||||
How it works:
|
||||
1. Records the start time before executing the function.
|
||||
2. Profiles the function's execution using `cProfile`.
|
||||
3. Prints the elapsed time and profiling statistics.
|
||||
|
||||
Args:
|
||||
func (Callable): The function to decorate.
|
||||
|
||||
Returns:
|
||||
Callable: The decorated function with profiling and timing enabled.
|
||||
"""
|
||||
|
||||
@wraps(func)
|
||||
def wrapper(self, *args, **kwargs):
|
||||
# Start timer
|
||||
|
||||
Reference in New Issue
Block a user