Enhance Markdown generation and external content control
- Integrate customized html2text library for flexible Markdown output - Add options to exclude external links and images - Improve content scraping efficiency and error handling - Update AsyncPlaywrightCrawlerStrategy for faster closing - Enhance CosineStrategy with generic embedding model loading
This commit is contained in:
25
CHANGELOG.md
25
CHANGELOG.md
@@ -1,5 +1,30 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.3.72] - 2024-10-20
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Forked and integrated a customized version of the html2text library for more control over Markdown generation
|
||||||
|
- New configuration options for controlling external content:
|
||||||
|
- Ability to exclude all external links
|
||||||
|
- Option to specify domains to exclude (default includes major social media platforms)
|
||||||
|
- Control over excluding external images
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Improved Markdown generation process:
|
||||||
|
- Added fine-grained control over character escaping in Markdown output
|
||||||
|
- Enhanced handling of code blocks and pre-formatted text
|
||||||
|
- Updated `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500)
|
||||||
|
- Enhanced flexibility in `CosineStrategy` with a more generic `load_HF_embedding_model` function
|
||||||
|
|
||||||
|
### Improved
|
||||||
|
- Optimized content scraping and processing for better efficiency
|
||||||
|
- Enhanced error handling and logging in various components
|
||||||
|
|
||||||
|
### Developer Notes
|
||||||
|
- The customized html2text library is now located within the crawl4ai package
|
||||||
|
- New configuration options are available in the `config.py` file for external content handling
|
||||||
|
- The `WebScrappingStrategy` class has been updated to accommodate new external content exclusion options
|
||||||
|
|
||||||
## [v0.3.71] - 2024-10-19
|
## [v0.3.71] - 2024-10-19
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|||||||
@@ -396,6 +396,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
response_headers = {}
|
response_headers = {}
|
||||||
|
|
||||||
await page.wait_for_selector('body')
|
await page.wait_for_selector('body')
|
||||||
|
|
||||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
|
||||||
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
|
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
|
||||||
@@ -477,7 +478,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
});
|
});
|
||||||
|
|
||||||
// Fallback timeout of 5 seconds
|
// Fallback timeout of 5 seconds
|
||||||
setTimeout(() => resolve(), 5000);
|
// setTimeout(() => resolve(), 5000);
|
||||||
|
resolve();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -186,7 +186,8 @@ class AsyncWebCrawler:
|
|||||||
try:
|
try:
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
scrapping_strategy = WebScrappingStrategy()
|
scrapping_strategy = WebScrappingStrategy()
|
||||||
result = await scrapping_strategy.ascrap(
|
# result = await scrapping_strategy.ascrap(
|
||||||
|
result = await scrapping_strategy.scrap(
|
||||||
url,
|
url,
|
||||||
html,
|
html,
|
||||||
word_count_threshold=word_count_threshold,
|
word_count_threshold=word_count_threshold,
|
||||||
|
|||||||
@@ -4,22 +4,21 @@ from dotenv import load_dotenv
|
|||||||
load_dotenv() # Load environment variables from .env file
|
load_dotenv() # Load environment variables from .env file
|
||||||
|
|
||||||
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||||
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
|
||||||
MODEL_REPO_BRANCH = "new-release-0.0.2"
|
MODEL_REPO_BRANCH = "new-release-0.0.2"
|
||||||
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||||
PROVIDER_MODELS = {
|
PROVIDER_MODELS = {
|
||||||
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
||||||
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
|
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
|
||||||
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
|
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
|
||||||
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
|
"openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
|
||||||
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
|
|
||||||
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
|
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
|
||||||
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
|
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
|
||||||
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||||
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||||
|
"anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# Chunk token threshold
|
# Chunk token threshold
|
||||||
CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens
|
CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens
|
||||||
OVERLAP_RATE = 0.1
|
OVERLAP_RATE = 0.1
|
||||||
@@ -29,6 +28,27 @@ WORD_TOKEN_RATE = 1.3
|
|||||||
MIN_WORD_THRESHOLD = 1
|
MIN_WORD_THRESHOLD = 1
|
||||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
|
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
|
||||||
|
|
||||||
|
IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
|
||||||
|
ONLY_TEXT_ELIGIBLE_TAGS = ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']
|
||||||
|
SOCIAL_MEDIA_DOMAINS = [
|
||||||
|
'facebook.com',
|
||||||
|
'twitter.com',
|
||||||
|
'x.com',
|
||||||
|
'linkedin.com',
|
||||||
|
'instagram.com',
|
||||||
|
'pinterest.com',
|
||||||
|
'youtube.com',
|
||||||
|
'tiktok.com',
|
||||||
|
'snapchat.com',
|
||||||
|
'whatsapp.com',
|
||||||
|
'messenger.com',
|
||||||
|
'reddit.com',
|
||||||
|
'tumblr.com',
|
||||||
|
'buffer.com',
|
||||||
|
'xing.com',
|
||||||
|
'flipboard.com',
|
||||||
|
]
|
||||||
|
|
||||||
# Threshold for the Image extraction - Range is 1 to 6
|
# Threshold for the Image extraction - Range is 1 to 6
|
||||||
# Images are scored based on point based system, to filter based on usefulness. Points are assigned
|
# Images are scored based on point based system, to filter based on usefulness. Points are assigned
|
||||||
# to each image based on the following aspects.
|
# to each image based on the following aspects.
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
soup = BeautifulSoup(html, 'html.parser')
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
body = soup.body
|
body = soup.body
|
||||||
|
|
||||||
|
|
||||||
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
||||||
|
|
||||||
for tag in kwargs.get('excluded_tags', []) or []:
|
for tag in kwargs.get('excluded_tags', []) or []:
|
||||||
@@ -150,6 +151,8 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
score+=1
|
score+=1
|
||||||
return score
|
return score
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
||||||
return None
|
return None
|
||||||
score = score_image_for_usefulness(img, url, index, total_images)
|
score = score_image_for_usefulness(img, url, index, total_images)
|
||||||
@@ -164,6 +167,19 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
'type': 'image'
|
'type': 'image'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False):
|
||||||
|
attrs_to_remove = []
|
||||||
|
for attr in element.attrs:
|
||||||
|
if attr not in important_attrs:
|
||||||
|
if keep_data_attributes:
|
||||||
|
if not attr.startswith('data-'):
|
||||||
|
attrs_to_remove.append(attr)
|
||||||
|
else:
|
||||||
|
attrs_to_remove.append(attr)
|
||||||
|
|
||||||
|
for attr in attrs_to_remove:
|
||||||
|
del element[attr]
|
||||||
|
|
||||||
def process_element(element: element.PageElement) -> bool:
|
def process_element(element: element.PageElement) -> bool:
|
||||||
try:
|
try:
|
||||||
if isinstance(element, NavigableString):
|
if isinstance(element, NavigableString):
|
||||||
@@ -191,7 +207,38 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
links['internal'].append(link_data)
|
links['internal'].append(link_data)
|
||||||
keep_element = True
|
keep_element = True
|
||||||
|
|
||||||
|
if kwargs.get('exclude_external_links', True):
|
||||||
|
href_url_base = href.split('/')[2]
|
||||||
|
if url_base not in href_url_base:
|
||||||
|
element.decompose()
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check if we should esclude links to all major social media platforms
|
||||||
|
if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
|
||||||
|
social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
|
||||||
|
social_media_domains = list(set(social_media_domains))
|
||||||
|
if any(domain in href for domain in social_media_domains):
|
||||||
|
element.decompose()
|
||||||
|
return False
|
||||||
|
|
||||||
elif element.name == 'img':
|
elif element.name == 'img':
|
||||||
|
# Check flag if we should remove external images
|
||||||
|
if kwargs.get('exclude_external_images', False):
|
||||||
|
src = element.get('src', '')
|
||||||
|
src_url_base = src.split('/')[2]
|
||||||
|
url_base = url.split('/')[2]
|
||||||
|
if url_base not in src_url_base:
|
||||||
|
element.decompose()
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
|
||||||
|
src = element.get('src', '')
|
||||||
|
src_url_base = src.split('/')[2]
|
||||||
|
url_base = url.split('/')[2]
|
||||||
|
if any(domain in src for domain in SOCIAL_MEDIA_DOMAINS):
|
||||||
|
element.decompose()
|
||||||
|
return False
|
||||||
|
|
||||||
return True # Always keep image elements
|
return True # Always keep image elements
|
||||||
|
|
||||||
elif element.name in ['video', 'audio']:
|
elif element.name in ['video', 'audio']:
|
||||||
@@ -211,14 +258,17 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
})
|
})
|
||||||
return True # Always keep video and audio elements
|
return True # Always keep video and audio elements
|
||||||
|
|
||||||
if element.name != 'pre':
|
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
|
||||||
if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
|
if kwargs.get('only_text', False):
|
||||||
if kwargs.get('only_text', False):
|
element.replace_with(element.get_text())
|
||||||
element.replace_with(element.get_text())
|
|
||||||
else:
|
remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
|
||||||
element.unwrap()
|
# for attr in element.attrs:
|
||||||
elif element.name != 'img':
|
# if attr not in IMPORTANT_ATTRS or (attr.startswith('data-') and not kwargs.get('keep_data_attributes', False)):
|
||||||
element.attrs = {}
|
# del element[attr]
|
||||||
|
|
||||||
|
# Print element name and attributes
|
||||||
|
print(element.name, element.attrs)
|
||||||
|
|
||||||
# Process children
|
# Process children
|
||||||
for child in list(element.children):
|
for child in list(element.children):
|
||||||
@@ -254,7 +304,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
process_element(body)
|
process_element(body)
|
||||||
|
|
||||||
# # Process images using ThreadPoolExecutor
|
# # Process images using ThreadPoolExecutor
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
imgs = body.find_all('img')
|
imgs = body.find_all('img')
|
||||||
|
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
image_results = list(executor.map(process_image, imgs, [url]*len(imgs), range(len(imgs)), [len(imgs)]*len(imgs)))
|
image_results = list(executor.map(process_image, imgs, [url]*len(imgs), range(len(imgs)), [len(imgs)]*len(imgs)))
|
||||||
media['images'] = [result for result in image_results if result is not None]
|
media['images'] = [result for result in image_results if result is not None]
|
||||||
@@ -307,10 +361,9 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
|
|
||||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||||
|
|
||||||
h = CustomHTML2Text()
|
|
||||||
h.ignore_links = not kwargs.get('include_links_on_markdown', False)
|
|
||||||
h.body_width = 0
|
|
||||||
try:
|
try:
|
||||||
|
h = CustomHTML2Text()
|
||||||
|
h.update_params(**kwargs.get('html2text', {}))
|
||||||
markdown = h.handle(cleaned_html)
|
markdown = h.handle(cleaned_html)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
markdown = h.handle(sanitize_html(cleaned_html))
|
markdown = h.handle(sanitize_html(cleaned_html))
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.provider = provider
|
self.provider = provider
|
||||||
self.api_token = api_token or PROVIDER_MODELS.get(provider, None) or os.getenv("OPENAI_API_KEY")
|
self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")
|
||||||
self.instruction = instruction
|
self.instruction = instruction
|
||||||
self.extract_type = extraction_type
|
self.extract_type = extraction_type
|
||||||
self.schema = schema
|
self.schema = schema
|
||||||
|
|||||||
1015
crawl4ai/html2text/__init__.py
Normal file
1015
crawl4ai/html2text/__init__.py
Normal file
File diff suppressed because it is too large
Load Diff
3
crawl4ai/html2text/__main__.py
Normal file
3
crawl4ai/html2text/__main__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from .cli import main
|
||||||
|
|
||||||
|
main()
|
||||||
2
crawl4ai/html2text/_typing.py
Normal file
2
crawl4ai/html2text/_typing.py
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
class OutCallback:
|
||||||
|
def __call__(self, s: str) -> None: ...
|
||||||
330
crawl4ai/html2text/cli.py
Normal file
330
crawl4ai/html2text/cli.py
Normal file
@@ -0,0 +1,330 @@
|
|||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from . import HTML2Text, __version__, config
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
baseurl = ""
|
||||||
|
|
||||||
|
class bcolors:
|
||||||
|
HEADER = "\033[95m"
|
||||||
|
OKBLUE = "\033[94m"
|
||||||
|
OKGREEN = "\033[92m"
|
||||||
|
WARNING = "\033[93m"
|
||||||
|
FAIL = "\033[91m"
|
||||||
|
ENDC = "\033[0m"
|
||||||
|
BOLD = "\033[1m"
|
||||||
|
UNDERLINE = "\033[4m"
|
||||||
|
|
||||||
|
p = argparse.ArgumentParser()
|
||||||
|
p.add_argument(
|
||||||
|
"--default-image-alt",
|
||||||
|
dest="default_image_alt",
|
||||||
|
default=config.DEFAULT_IMAGE_ALT,
|
||||||
|
help="The default alt string for images with missing ones",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--pad-tables",
|
||||||
|
dest="pad_tables",
|
||||||
|
action="store_true",
|
||||||
|
default=config.PAD_TABLES,
|
||||||
|
help="pad the cells to equal column width in tables",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--no-wrap-links",
|
||||||
|
dest="wrap_links",
|
||||||
|
action="store_false",
|
||||||
|
default=config.WRAP_LINKS,
|
||||||
|
help="don't wrap links during conversion",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--wrap-list-items",
|
||||||
|
dest="wrap_list_items",
|
||||||
|
action="store_true",
|
||||||
|
default=config.WRAP_LIST_ITEMS,
|
||||||
|
help="wrap list items during conversion",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--wrap-tables",
|
||||||
|
dest="wrap_tables",
|
||||||
|
action="store_true",
|
||||||
|
default=config.WRAP_TABLES,
|
||||||
|
help="wrap tables",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--ignore-emphasis",
|
||||||
|
dest="ignore_emphasis",
|
||||||
|
action="store_true",
|
||||||
|
default=config.IGNORE_EMPHASIS,
|
||||||
|
help="don't include any formatting for emphasis",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--reference-links",
|
||||||
|
dest="inline_links",
|
||||||
|
action="store_false",
|
||||||
|
default=config.INLINE_LINKS,
|
||||||
|
help="use reference style links instead of inline links",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--ignore-links",
|
||||||
|
dest="ignore_links",
|
||||||
|
action="store_true",
|
||||||
|
default=config.IGNORE_ANCHORS,
|
||||||
|
help="don't include any formatting for links",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--ignore-mailto-links",
|
||||||
|
action="store_true",
|
||||||
|
dest="ignore_mailto_links",
|
||||||
|
default=config.IGNORE_MAILTO_LINKS,
|
||||||
|
help="don't include mailto: links",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--protect-links",
|
||||||
|
dest="protect_links",
|
||||||
|
action="store_true",
|
||||||
|
default=config.PROTECT_LINKS,
|
||||||
|
help="protect links from line breaks surrounding them with angle brackets",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--ignore-images",
|
||||||
|
dest="ignore_images",
|
||||||
|
action="store_true",
|
||||||
|
default=config.IGNORE_IMAGES,
|
||||||
|
help="don't include any formatting for images",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--images-as-html",
|
||||||
|
dest="images_as_html",
|
||||||
|
action="store_true",
|
||||||
|
default=config.IMAGES_AS_HTML,
|
||||||
|
help=(
|
||||||
|
"Always write image tags as raw html; preserves `height`, `width` and "
|
||||||
|
"`alt` if possible."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--images-to-alt",
|
||||||
|
dest="images_to_alt",
|
||||||
|
action="store_true",
|
||||||
|
default=config.IMAGES_TO_ALT,
|
||||||
|
help="Discard image data, only keep alt text",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--images-with-size",
|
||||||
|
dest="images_with_size",
|
||||||
|
action="store_true",
|
||||||
|
default=config.IMAGES_WITH_SIZE,
|
||||||
|
help=(
|
||||||
|
"Write image tags with height and width attrs as raw html to retain "
|
||||||
|
"dimensions"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"-g",
|
||||||
|
"--google-doc",
|
||||||
|
action="store_true",
|
||||||
|
dest="google_doc",
|
||||||
|
default=False,
|
||||||
|
help="convert an html-exported Google Document",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"-d",
|
||||||
|
"--dash-unordered-list",
|
||||||
|
action="store_true",
|
||||||
|
dest="ul_style_dash",
|
||||||
|
default=False,
|
||||||
|
help="use a dash rather than a star for unordered list items",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"-e",
|
||||||
|
"--asterisk-emphasis",
|
||||||
|
action="store_true",
|
||||||
|
dest="em_style_asterisk",
|
||||||
|
default=False,
|
||||||
|
help="use an asterisk rather than an underscore for emphasized text",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"-b",
|
||||||
|
"--body-width",
|
||||||
|
dest="body_width",
|
||||||
|
type=int,
|
||||||
|
default=config.BODY_WIDTH,
|
||||||
|
help="number of characters per output line, 0 for no wrap",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"-i",
|
||||||
|
"--google-list-indent",
|
||||||
|
dest="list_indent",
|
||||||
|
type=int,
|
||||||
|
default=config.GOOGLE_LIST_INDENT,
|
||||||
|
help="number of pixels Google indents nested lists",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"-s",
|
||||||
|
"--hide-strikethrough",
|
||||||
|
action="store_true",
|
||||||
|
dest="hide_strikethrough",
|
||||||
|
default=False,
|
||||||
|
help="hide strike-through text. only relevant when -g is " "specified as well",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--escape-all",
|
||||||
|
action="store_true",
|
||||||
|
dest="escape_snob",
|
||||||
|
default=False,
|
||||||
|
help=(
|
||||||
|
"Escape all special characters. Output is less readable, but avoids "
|
||||||
|
"corner case formatting issues."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--bypass-tables",
|
||||||
|
action="store_true",
|
||||||
|
dest="bypass_tables",
|
||||||
|
default=config.BYPASS_TABLES,
|
||||||
|
help="Format tables in HTML rather than Markdown syntax.",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--ignore-tables",
|
||||||
|
action="store_true",
|
||||||
|
dest="ignore_tables",
|
||||||
|
default=config.IGNORE_TABLES,
|
||||||
|
help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--single-line-break",
|
||||||
|
action="store_true",
|
||||||
|
dest="single_line_break",
|
||||||
|
default=config.SINGLE_LINE_BREAK,
|
||||||
|
help=(
|
||||||
|
"Use a single line break after a block element rather than two line "
|
||||||
|
"breaks. NOTE: Requires --body-width=0"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--unicode-snob",
|
||||||
|
action="store_true",
|
||||||
|
dest="unicode_snob",
|
||||||
|
default=config.UNICODE_SNOB,
|
||||||
|
help="Use unicode throughout document",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--no-automatic-links",
|
||||||
|
action="store_false",
|
||||||
|
dest="use_automatic_links",
|
||||||
|
default=config.USE_AUTOMATIC_LINKS,
|
||||||
|
help="Do not use automatic links wherever applicable",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--no-skip-internal-links",
|
||||||
|
action="store_false",
|
||||||
|
dest="skip_internal_links",
|
||||||
|
default=config.SKIP_INTERNAL_LINKS,
|
||||||
|
help="Do not skip internal links",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--links-after-para",
|
||||||
|
action="store_true",
|
||||||
|
dest="links_each_paragraph",
|
||||||
|
default=config.LINKS_EACH_PARAGRAPH,
|
||||||
|
help="Put links after each paragraph instead of document",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--mark-code",
|
||||||
|
action="store_true",
|
||||||
|
dest="mark_code",
|
||||||
|
default=config.MARK_CODE,
|
||||||
|
help="Mark program code blocks with [code]...[/code]",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--decode-errors",
|
||||||
|
dest="decode_errors",
|
||||||
|
default=config.DECODE_ERRORS,
|
||||||
|
help=(
|
||||||
|
"What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
|
||||||
|
"acceptable values"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--open-quote",
|
||||||
|
dest="open_quote",
|
||||||
|
default=config.OPEN_QUOTE,
|
||||||
|
help="The character used to open quotes",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--close-quote",
|
||||||
|
dest="close_quote",
|
||||||
|
default=config.CLOSE_QUOTE,
|
||||||
|
help="The character used to close quotes",
|
||||||
|
)
|
||||||
|
p.add_argument(
|
||||||
|
"--version", action="version", version=".".join(map(str, __version__))
|
||||||
|
)
|
||||||
|
p.add_argument("filename", nargs="?")
|
||||||
|
p.add_argument("encoding", nargs="?", default="utf-8")
|
||||||
|
p.add_argument(
|
||||||
|
"--include-sup-sub",
|
||||||
|
dest="include_sup_sub",
|
||||||
|
action="store_true",
|
||||||
|
default=config.INCLUDE_SUP_SUB,
|
||||||
|
help="Include the sup and sub tags",
|
||||||
|
)
|
||||||
|
args = p.parse_args()
|
||||||
|
|
||||||
|
if args.filename and args.filename != "-":
|
||||||
|
with open(args.filename, "rb") as fp:
|
||||||
|
data = fp.read()
|
||||||
|
else:
|
||||||
|
data = sys.stdin.buffer.read()
|
||||||
|
|
||||||
|
try:
|
||||||
|
html = data.decode(args.encoding, args.decode_errors)
|
||||||
|
except UnicodeDecodeError as err:
|
||||||
|
warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
|
||||||
|
warning += " Use the " + bcolors.OKGREEN
|
||||||
|
warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
|
||||||
|
print(warning)
|
||||||
|
raise err
|
||||||
|
|
||||||
|
h = HTML2Text(baseurl=baseurl)
|
||||||
|
# handle options
|
||||||
|
if args.ul_style_dash:
|
||||||
|
h.ul_item_mark = "-"
|
||||||
|
if args.em_style_asterisk:
|
||||||
|
h.emphasis_mark = "*"
|
||||||
|
h.strong_mark = "__"
|
||||||
|
|
||||||
|
h.body_width = args.body_width
|
||||||
|
h.google_list_indent = args.list_indent
|
||||||
|
h.ignore_emphasis = args.ignore_emphasis
|
||||||
|
h.ignore_links = args.ignore_links
|
||||||
|
h.ignore_mailto_links = args.ignore_mailto_links
|
||||||
|
h.protect_links = args.protect_links
|
||||||
|
h.ignore_images = args.ignore_images
|
||||||
|
h.images_as_html = args.images_as_html
|
||||||
|
h.images_to_alt = args.images_to_alt
|
||||||
|
h.images_with_size = args.images_with_size
|
||||||
|
h.google_doc = args.google_doc
|
||||||
|
h.hide_strikethrough = args.hide_strikethrough
|
||||||
|
h.escape_snob = args.escape_snob
|
||||||
|
h.bypass_tables = args.bypass_tables
|
||||||
|
h.ignore_tables = args.ignore_tables
|
||||||
|
h.single_line_break = args.single_line_break
|
||||||
|
h.inline_links = args.inline_links
|
||||||
|
h.unicode_snob = args.unicode_snob
|
||||||
|
h.use_automatic_links = args.use_automatic_links
|
||||||
|
h.skip_internal_links = args.skip_internal_links
|
||||||
|
h.links_each_paragraph = args.links_each_paragraph
|
||||||
|
h.mark_code = args.mark_code
|
||||||
|
h.wrap_links = args.wrap_links
|
||||||
|
h.wrap_list_items = args.wrap_list_items
|
||||||
|
h.wrap_tables = args.wrap_tables
|
||||||
|
h.pad_tables = args.pad_tables
|
||||||
|
h.default_image_alt = args.default_image_alt
|
||||||
|
h.open_quote = args.open_quote
|
||||||
|
h.close_quote = args.close_quote
|
||||||
|
h.include_sup_sub = args.include_sup_sub
|
||||||
|
|
||||||
|
sys.stdout.write(h.handle(html))
|
||||||
172
crawl4ai/html2text/config.py
Normal file
172
crawl4ai/html2text/config.py
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
# Use Unicode characters instead of their ascii pseudo-replacements
|
||||||
|
UNICODE_SNOB = False
|
||||||
|
|
||||||
|
# Marker to use for marking tables for padding post processing
|
||||||
|
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
|
||||||
|
# Escape all special characters. Output is less readable, but avoids
|
||||||
|
# corner case formatting issues.
|
||||||
|
ESCAPE_SNOB = False
|
||||||
|
ESCAPE_BACKSLASH = False
|
||||||
|
ESCAPE_DOT = False
|
||||||
|
ESCAPE_PLUS = False
|
||||||
|
ESCAPE_DASH = False
|
||||||
|
|
||||||
|
# Put the links after each paragraph instead of at the end.
|
||||||
|
LINKS_EACH_PARAGRAPH = False
|
||||||
|
|
||||||
|
# Wrap long lines at position. 0 for no wrapping.
|
||||||
|
BODY_WIDTH = 78
|
||||||
|
|
||||||
|
# Don't show internal links (href="#local-anchor") -- corresponding link
|
||||||
|
# targets won't be visible in the plain text file anyway.
|
||||||
|
SKIP_INTERNAL_LINKS = True
|
||||||
|
|
||||||
|
# Use inline, rather than reference, formatting for images and links
|
||||||
|
INLINE_LINKS = True
|
||||||
|
|
||||||
|
# Protect links from line breaks surrounding them with angle brackets (in
|
||||||
|
# addition to their square brackets)
|
||||||
|
PROTECT_LINKS = False
|
||||||
|
# WRAP_LINKS = True
|
||||||
|
WRAP_LINKS = True
|
||||||
|
|
||||||
|
# Wrap list items.
|
||||||
|
WRAP_LIST_ITEMS = False
|
||||||
|
|
||||||
|
# Wrap tables
|
||||||
|
WRAP_TABLES = False
|
||||||
|
|
||||||
|
# Number of pixels Google indents nested lists
|
||||||
|
GOOGLE_LIST_INDENT = 36
|
||||||
|
|
||||||
|
# Values Google and others may use to indicate bold text
|
||||||
|
BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")
|
||||||
|
|
||||||
|
IGNORE_ANCHORS = False
|
||||||
|
IGNORE_MAILTO_LINKS = False
|
||||||
|
IGNORE_IMAGES = False
|
||||||
|
IMAGES_AS_HTML = False
|
||||||
|
IMAGES_TO_ALT = False
|
||||||
|
IMAGES_WITH_SIZE = False
|
||||||
|
IGNORE_EMPHASIS = False
|
||||||
|
MARK_CODE = False
|
||||||
|
DECODE_ERRORS = "strict"
|
||||||
|
DEFAULT_IMAGE_ALT = ""
|
||||||
|
PAD_TABLES = False
|
||||||
|
|
||||||
|
# Convert links with same href and text to <href> format
|
||||||
|
# if they are absolute links
|
||||||
|
USE_AUTOMATIC_LINKS = True
|
||||||
|
|
||||||
|
# For checking space-only lines on line 771
|
||||||
|
RE_SPACE = re.compile(r"\s\+")
|
||||||
|
|
||||||
|
RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
|
||||||
|
RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
|
||||||
|
RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
|
||||||
|
RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
|
||||||
|
|
||||||
|
# to find links in the text
|
||||||
|
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
|
||||||
|
|
||||||
|
# to find table separators
|
||||||
|
RE_TABLE = re.compile(r" \| ")
|
||||||
|
|
||||||
|
RE_MD_DOT_MATCHER = re.compile(
|
||||||
|
r"""
|
||||||
|
^ # start of line
|
||||||
|
(\s*\d+) # optional whitespace and a number
|
||||||
|
(\.) # dot
|
||||||
|
(?=\s) # lookahead assert whitespace
|
||||||
|
""",
|
||||||
|
re.MULTILINE | re.VERBOSE,
|
||||||
|
)
|
||||||
|
RE_MD_PLUS_MATCHER = re.compile(
|
||||||
|
r"""
|
||||||
|
^
|
||||||
|
(\s*)
|
||||||
|
(\+)
|
||||||
|
(?=\s)
|
||||||
|
""",
|
||||||
|
flags=re.MULTILINE | re.VERBOSE,
|
||||||
|
)
|
||||||
|
RE_MD_DASH_MATCHER = re.compile(
|
||||||
|
r"""
|
||||||
|
^
|
||||||
|
(\s*)
|
||||||
|
(-)
|
||||||
|
(?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
|
||||||
|
# or another dash (header or hr)
|
||||||
|
""",
|
||||||
|
flags=re.MULTILINE | re.VERBOSE,
|
||||||
|
)
|
||||||
|
RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
|
||||||
|
RE_MD_BACKSLASH_MATCHER = re.compile(
|
||||||
|
r"""
|
||||||
|
(\\) # match one slash
|
||||||
|
(?=[%s]) # followed by a char that requires escaping
|
||||||
|
"""
|
||||||
|
% re.escape(RE_SLASH_CHARS),
|
||||||
|
flags=re.VERBOSE,
|
||||||
|
)
|
||||||
|
|
||||||
|
UNIFIABLE = {
|
||||||
|
"rsquo": "'",
|
||||||
|
"lsquo": "'",
|
||||||
|
"rdquo": '"',
|
||||||
|
"ldquo": '"',
|
||||||
|
"copy": "(C)",
|
||||||
|
"mdash": "--",
|
||||||
|
"nbsp": " ",
|
||||||
|
"rarr": "->",
|
||||||
|
"larr": "<-",
|
||||||
|
"middot": "*",
|
||||||
|
"ndash": "-",
|
||||||
|
"oelig": "oe",
|
||||||
|
"aelig": "ae",
|
||||||
|
"agrave": "a",
|
||||||
|
"aacute": "a",
|
||||||
|
"acirc": "a",
|
||||||
|
"atilde": "a",
|
||||||
|
"auml": "a",
|
||||||
|
"aring": "a",
|
||||||
|
"egrave": "e",
|
||||||
|
"eacute": "e",
|
||||||
|
"ecirc": "e",
|
||||||
|
"euml": "e",
|
||||||
|
"igrave": "i",
|
||||||
|
"iacute": "i",
|
||||||
|
"icirc": "i",
|
||||||
|
"iuml": "i",
|
||||||
|
"ograve": "o",
|
||||||
|
"oacute": "o",
|
||||||
|
"ocirc": "o",
|
||||||
|
"otilde": "o",
|
||||||
|
"ouml": "o",
|
||||||
|
"ugrave": "u",
|
||||||
|
"uacute": "u",
|
||||||
|
"ucirc": "u",
|
||||||
|
"uuml": "u",
|
||||||
|
"lrm": "",
|
||||||
|
"rlm": "",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Format tables in HTML rather than Markdown syntax
|
||||||
|
BYPASS_TABLES = False
|
||||||
|
# Ignore table-related tags (table, th, td, tr) while keeping rows
|
||||||
|
IGNORE_TABLES = False
|
||||||
|
|
||||||
|
|
||||||
|
# Use a single line break after a block element rather than two line breaks.
|
||||||
|
# NOTE: Requires body width setting to be 0.
|
||||||
|
SINGLE_LINE_BREAK = False
|
||||||
|
|
||||||
|
|
||||||
|
# Use double quotation marks when converting the <q> tag.
|
||||||
|
OPEN_QUOTE = '"'
|
||||||
|
CLOSE_QUOTE = '"'
|
||||||
|
|
||||||
|
# Include the <sup> and <sub> tags
|
||||||
|
INCLUDE_SUP_SUB = False
|
||||||
18
crawl4ai/html2text/elements.py
Normal file
18
crawl4ai/html2text/elements.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class AnchorElement:
    """Bookkeeping record for one <a> element seen during conversion.

    Holds the tag's attribute dict plus the link-numbering counters used
    when emitting reference-style links.
    """

    __slots__ = ["attrs", "count", "outcount"]

    def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
        self.attrs, self.count, self.outcount = attrs, count, outcount
|
|
||||||
|
class ListElement:
    """Bookkeeping record for an open list: its tag name and item counter."""

    __slots__ = ["name", "num"]

    def __init__(self, name: str, num: int):
        self.name, self.num = name, num
||||||
303
crawl4ai/html2text/utils.py
Normal file
303
crawl4ai/html2text/utils.py
Normal file
@@ -0,0 +1,303 @@
|
|||||||
|
import html.entities
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
from . import config
|
||||||
|
|
||||||
|
# Entity code point -> ASCII replacement, derived from config.UNIFIABLE.
# "nbsp" is excluded because non-breaking spaces get dedicated handling.
unifiable_n = {
    html.entities.name2codepoint[entity]: replacement
    for entity, replacement in config.UNIFIABLE.items()
    if entity != "nbsp"
}
|
|
||||||
|
|
||||||
|
def hn(tag: str) -> int:
    """Return the heading level of tags "h1".."h9"; 0 for any other tag."""
    if tag[0] == "h" and len(tag) == 2 and tag[1] in "123456789":
        return int(tag[1])
    return 0
|
|
||||||
|
|
||||||
|
def dumb_property_dict(style: str) -> Dict[str, str]:
    """Parse an inline CSS string into a ``{property: value}`` dict.

    Both property names and values are stripped and lowercased;
    declarations without a ``:`` are silently skipped.
    """
    result: Dict[str, str] = {}
    for declaration in style.split(";"):
        if ":" not in declaration:
            continue
        prop, _, value = declaration.partition(":")
        result[prop.strip().lower()] = value.strip().lower()
    return result
|
|
||||||
|
|
||||||
|
def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """Tiny CSS parser: return ``{selector: {property: value}}``.

    ``@import`` statements are stripped out first.  A malformed stylesheet
    (a rule chunk with more than one ``{``) yields an empty result rather
    than raising.
    """
    # Strip every "@import ...;" statement before parsing.  The trailing
    # ";" guarantees the inner find() always succeeds.
    data += ";"
    start = data.find("@import")
    while start != -1:
        end = data.find(";", start)
        data = data[:start] + data[end + 1 :]
        start = data.find("@import")

    # Break the sheet into "selector { declarations" chunks.
    chunks = [piece.split("{") for piece in data.split("}") if "{" in piece.strip()]
    try:
        rules = {selector.strip(): dumb_property_dict(body) for selector, body in chunks}
    except ValueError:
        rules = {}  # not that important
    return rules
|
|
||||||
|
|
||||||
|
def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """Compute the effective ("final") style of an element.

    Starts from the parent's computed style, layers on any matching
    ``.class`` rules from *style_def*, then applies the inline ``style``
    attribute, which takes highest precedence.
    """
    computed = dict(parent_style)
    if "class" in attrs:
        class_attr = attrs["class"]
        assert class_attr is not None
        for css_class in class_attr.split():
            computed.update(style_def.get("." + css_class, {}))
    if "style" in attrs:
        inline = attrs["style"]
        assert inline is not None
        computed.update(dumb_property_dict(inline))
    return computed
|
|
||||||
|
|
||||||
|
def google_list_style(style: Dict[str, str]) -> str:
    """Classify a Google Docs list as unordered ("ul") or ordered ("ol")."""
    if style.get("list-style-type") in ("disc", "circle", "square", "none"):
        return "ul"
    return "ol"
|
|
||||||
|
|
||||||
|
def google_has_height(style: Dict[str, str]) -> bool:
    """Report whether the element's style sets an explicit ``height``."""
    return "height" in style
|
|
||||||
|
|
||||||
|
def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """Collect the emphasis modifiers present in *style*.

    Returns the values of ``text-decoration``, ``font-style`` and
    ``font-weight`` — in that fixed order — for whichever are present.
    """
    return [
        style[prop]
        for prop in ("text-decoration", "font-style", "font-weight")
        if prop in style
    ]
|
|
||||||
|
|
||||||
|
def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """Report whether the style selects a monospace font.

    Only the exact (already-lowercased) family names "courier new" and
    "consolas" are recognised.
    """
    return style.get("font-family", "") in ("courier new", "consolas")
|
|
||||||
|
|
||||||
|
def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """Return the zero-based numbering offset from an <ol> ``start`` attribute.

    Falls back to 0 when the attribute is missing or not an integer.
    """
    if "start" not in attrs:
        return 0
    start = attrs["start"]
    assert start is not None
    try:
        return int(start) - 1
    except ValueError:
        return 0
|
|
||||||
|
|
||||||
|
def skipwrap(
    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
    """Decide whether *para* must be left unwrapped by the output formatter."""
    # Paragraphs that appear to contain a link are left alone unless
    # link-wrapping was explicitly requested.
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # Four leading spaces or a leading tab mark an indented code block.
    if para.startswith("    ") or para[0] == "\t":
        return True

    stripped = para.lstrip()
    # A bare leading "--" (not part of a longer dash run, possibly preceded
    # by whitespace) is an emdash, so wrapping is fine.
    if stripped.startswith("--") and len(stripped) > 2 and stripped[2] != "-":
        return False

    # A leading "-" or "*" (but not "**") looks like a list bullet, so honour
    # the wrap_list_items preference.  This branch also covers a
    # <br>-inside-<span> case exercised by the test suite.
    if stripped[0:1] in ("-", "*") and stripped[0:2] != "**":
        return not wrap_list_items

    # A pipe character suggests a Markdown table row.
    if not wrap_tables and config.RE_TABLE.search(para):
        return True

    # Ordered/unordered list markers ("1. ", "- ", "* ", "+ ", optionally
    # preceded by whitespace) must never be wrapped.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )
|
|
||||||
|
|
||||||
|
def escape_md(text: str) -> str:
    """Backslash-escape markdown-sensitive characters found inside other
    markdown constructs."""
    return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
|
|
||||||
|
|
||||||
|
def escape_md_section(
    text: str,
    escape_backslash: bool = True,
    snob: bool = False,
    escape_dot: bool = True,
    escape_plus: bool = True,
    escape_dash: bool = True,
) -> str:
    """Backslash-escape markdown-sensitive characters across a whole section.

    Each class of escaping can be toggled independently; ``snob``
    additionally escapes the full set of markdown special characters.
    """
    # Order matters: backslashes are escaped first so the later passes do
    # not double-escape the backslashes they introduce.
    passes = (
        (escape_backslash, config.RE_MD_BACKSLASH_MATCHER, r"\\\1"),
        (snob, config.RE_MD_CHARS_MATCHER_ALL, r"\\\1"),
        (escape_dot, config.RE_MD_DOT_MATCHER, r"\1\\\2"),
        (escape_plus, config.RE_MD_PLUS_MATCHER, r"\1\\\2"),
        (escape_dash, config.RE_MD_DASH_MATCHER, r"\1\\\2"),
    )
    for enabled, pattern, replacement in passes:
        if enabled:
            text = pattern.sub(replacement, text)
    return text
|
|
||||||
|
def reformat_table(lines: List[str], right_margin: int) -> List[str]:
|
||||||
|
"""
|
||||||
|
Given the lines of a table
|
||||||
|
padds the cells and returns the new lines
|
||||||
|
"""
|
||||||
|
# find the maximum width of the columns
|
||||||
|
max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
|
||||||
|
max_cols = len(max_width)
|
||||||
|
for line in lines:
|
||||||
|
cols = [x.rstrip() for x in line.split("|")]
|
||||||
|
num_cols = len(cols)
|
||||||
|
|
||||||
|
# don't drop any data if colspan attributes result in unequal lengths
|
||||||
|
if num_cols < max_cols:
|
||||||
|
cols += [""] * (max_cols - num_cols)
|
||||||
|
elif max_cols < num_cols:
|
||||||
|
max_width += [len(x) + right_margin for x in cols[-(num_cols - max_cols) :]]
|
||||||
|
max_cols = num_cols
|
||||||
|
|
||||||
|
max_width = [
|
||||||
|
max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
|
||||||
|
]
|
||||||
|
|
||||||
|
# reformat
|
||||||
|
new_lines = []
|
||||||
|
for line in lines:
|
||||||
|
cols = [x.rstrip() for x in line.split("|")]
|
||||||
|
if set(line.strip()) == set("-|"):
|
||||||
|
filler = "-"
|
||||||
|
new_cols = [
|
||||||
|
x.rstrip() + (filler * (M - len(x.rstrip())))
|
||||||
|
for x, M in zip(cols, max_width)
|
||||||
|
]
|
||||||
|
new_lines.append("|-" + "|".join(new_cols) + "|")
|
||||||
|
else:
|
||||||
|
filler = " "
|
||||||
|
new_cols = [
|
||||||
|
x.rstrip() + (filler * (M - len(x.rstrip())))
|
||||||
|
for x, M in zip(cols, max_width)
|
||||||
|
]
|
||||||
|
new_lines.append("| " + "|".join(new_cols) + "|")
|
||||||
|
return new_lines
|
||||||
|
|
||||||
|
|
||||||
|
def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """Pad every marker-delimited table found in *text*.

    Tables are delimited by lines containing ``config.TABLE_MARKER_FOR_PAD``;
    everything between a pair of markers is reformatted via
    :func:`reformat_table`, and the markers themselves are dropped.
    """
    new_lines: List[str] = []
    table_buffer: List[str] = []
    inside_table = False
    for line in text.split("\n"):
        if config.TABLE_MARKER_FOR_PAD in line:
            # Markers toggle table mode; the closing marker flushes the
            # buffered rows through the reformatter.
            inside_table = not inside_table
            if not inside_table:
                new_lines.extend(reformat_table(table_buffer, right_margin))
                table_buffer = []
                new_lines.append("")
            continue
        if inside_table:
            table_buffer.append(line)
        else:
            new_lines.append(line)
    return "\n".join(new_lines)
||||||
@@ -1,13 +1,12 @@
|
|||||||
import time
|
import time
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||||
import html2text
|
|
||||||
import json
|
import json
|
||||||
import html
|
import html
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
from html2text import HTML2Text
|
from .html2text import HTML2Text
|
||||||
from .prompts import PROMPT_EXTRACT_BLOCKS
|
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||||
from .config import *
|
from .config import *
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -182,10 +181,23 @@ def escape_json_string(s):
|
|||||||
class CustomHTML2Text(HTML2Text):
|
class CustomHTML2Text(HTML2Text):
|
||||||
def __init__(self, *args, **kwargs):
    """Configure the forked HTML2Text converter with crawl4ai's defaults."""
    super().__init__(*args, **kwargs)
    # Nesting state consulted by handle_tag for <pre>/<code> handling.
    self.inside_pre = False
    self.inside_code = False
    # Conversion behaviour: keep regular links, drop mailto: links,
    # and disable line wrapping entirely (body_width = 0).
    self.skip_internal_links = False
    self.single_line_break = False
    self.mark_code = False
    self.include_sup_sub = False
    self.body_width = 0
    self.ignore_mailto_links = True
    self.ignore_links = False
    # All character escaping is off by default for cleaner Markdown output.
    self.escape_backslash = False
    self.escape_dot = False
    self.escape_plus = False
    self.escape_dash = False
    self.escape_snob = False
def handle_tag(self, tag, attrs, start):
|
def handle_tag(self, tag, attrs, start):
|
||||||
if tag == 'pre':
|
if tag == 'pre':
|
||||||
if start:
|
if start:
|
||||||
@@ -194,6 +206,10 @@ class CustomHTML2Text(HTML2Text):
|
|||||||
else:
|
else:
|
||||||
self.o('\n```')
|
self.o('\n```')
|
||||||
self.inside_pre = False
|
self.inside_pre = False
|
||||||
|
elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
# elif tag == 'code' and not self.inside_pre:
|
# elif tag == 'code' and not self.inside_pre:
|
||||||
# if start:
|
# if start:
|
||||||
# if not self.inside_pre:
|
# if not self.inside_pre:
|
||||||
|
|||||||
Reference in New Issue
Block a user