Enhance Markdown generation and external content control
- Integrate customized html2text library for flexible Markdown output
- Add options to exclude external links and images
- Improve content scraping efficiency and error handling
- Update AsyncPlaywrightCrawlerStrategy for faster closing
- Enhance CosineStrategy with generic embedding model loading
This commit is contained in:
25
CHANGELOG.md
25
CHANGELOG.md
@@ -1,5 +1,30 @@
|
||||
# Changelog
|
||||
|
||||
## [v0.3.72] - 2024-10-20
|
||||
|
||||
### Added
|
||||
- Forked and integrated a customized version of the html2text library for more control over Markdown generation
|
||||
- New configuration options for controlling external content:
|
||||
- Ability to exclude all external links
|
||||
- Option to specify domains to exclude (default includes major social media platforms)
|
||||
- Control over excluding external images
|
||||
|
||||
### Changed
|
||||
- Improved Markdown generation process:
|
||||
- Added fine-grained control over character escaping in Markdown output
|
||||
- Enhanced handling of code blocks and pre-formatted text
|
||||
- Updated `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500)
|
||||
- Enhanced flexibility in `CosineStrategy` with a more generic `load_HF_embedding_model` function
|
||||
|
||||
### Improved
|
||||
- Optimized content scraping and processing for better efficiency
|
||||
- Enhanced error handling and logging in various components
|
||||
|
||||
### Developer Notes
|
||||
- The customized html2text library is now located within the crawl4ai package
|
||||
- New configuration options are available in the `config.py` file for external content handling
|
||||
- The `WebScrappingStrategy` class has been updated to accommodate new external content exclusion options
|
||||
|
||||
## [v0.3.71] - 2024-10-19
|
||||
|
||||
### Added
|
||||
|
||||
@@ -396,6 +396,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
response_headers = {}
|
||||
|
||||
await page.wait_for_selector('body')
|
||||
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
|
||||
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
|
||||
@@ -477,7 +478,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
});
|
||||
|
||||
// Fallback timeout of 5 seconds
|
||||
setTimeout(() => resolve(), 5000);
|
||||
// setTimeout(() => resolve(), 5000);
|
||||
resolve();
|
||||
});
|
||||
}
|
||||
"""
|
||||
|
||||
@@ -186,7 +186,8 @@ class AsyncWebCrawler:
|
||||
try:
|
||||
t1 = time.time()
|
||||
scrapping_strategy = WebScrappingStrategy()
|
||||
result = await scrapping_strategy.ascrap(
|
||||
# result = await scrapping_strategy.ascrap(
|
||||
result = await scrapping_strategy.scrap(
|
||||
url,
|
||||
html,
|
||||
word_count_threshold=word_count_threshold,
|
||||
|
||||
@@ -4,22 +4,21 @@ from dotenv import load_dotenv
|
||||
load_dotenv() # Load environment variables from .env file
|
||||
|
||||
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
DEFAULT_PROVIDER = "openai/gpt-4-turbo"
|
||||
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
|
||||
MODEL_REPO_BRANCH = "new-release-0.0.2"
|
||||
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
PROVIDER_MODELS = {
|
||||
"ollama/llama3": "no-token-needed", # Any model from Ollama no need for API token
|
||||
"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"),
|
||||
"openai/gpt-3.5-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4-turbo": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"),
|
||||
"openai/gpt-4o": os.getenv("OPENAI_API_KEY"),
|
||||
"anthropic/claude-3-haiku-20240307": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"anthropic/claude-3-5-sonnet-20240620": os.getenv("ANTHROPIC_API_KEY"),
|
||||
}
|
||||
|
||||
|
||||
# Chunk token threshold
|
||||
CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens
|
||||
OVERLAP_RATE = 0.1
|
||||
@@ -29,6 +28,27 @@ WORD_TOKEN_RATE = 1.3
|
||||
MIN_WORD_THRESHOLD = 1
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
|
||||
|
||||
IMPORTANT_ATTRS = ['src', 'href', 'alt', 'title', 'width', 'height']
|
||||
ONLY_TEXT_ELIGIBLE_TAGS = ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']
|
||||
SOCIAL_MEDIA_DOMAINS = [
|
||||
'facebook.com',
|
||||
'twitter.com',
|
||||
'x.com',
|
||||
'linkedin.com',
|
||||
'instagram.com',
|
||||
'pinterest.com',
|
||||
'youtube.com',
|
||||
'tiktok.com',
|
||||
'snapchat.com',
|
||||
'whatsapp.com',
|
||||
'messenger.com',
|
||||
'reddit.com',
|
||||
'tumblr.com',
|
||||
'buffer.com',
|
||||
'xing.com',
|
||||
'flipboard.com',
|
||||
]
|
||||
|
||||
# Threshold for the Image extraction - Range is 1 to 6
|
||||
# Images are scored based on point based system, to filter based on usefulness. Points are assigned
|
||||
# to each image based on the following aspects.
|
||||
|
||||
@@ -40,6 +40,7 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
body = soup.body
|
||||
|
||||
|
||||
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
||||
|
||||
for tag in kwargs.get('excluded_tags', []) or []:
|
||||
@@ -150,6 +151,8 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
score+=1
|
||||
return score
|
||||
|
||||
|
||||
|
||||
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
||||
return None
|
||||
score = score_image_for_usefulness(img, url, index, total_images)
|
||||
@@ -164,6 +167,19 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
'type': 'image'
|
||||
}
|
||||
|
||||
def remove_unwanted_attributes(element, important_attrs, keep_data_attributes=False):
    """Strip every attribute of *element* that is not in *important_attrs*.

    When *keep_data_attributes* is true, attributes whose name starts with
    ``data-`` are preserved as well.
    """
    # Collect the names first so the attribute mapping is never mutated
    # while it is being iterated.
    doomed = [
        name
        for name in element.attrs
        if name not in important_attrs
        and not (keep_data_attributes and name.startswith('data-'))
    ]
    for name in doomed:
        del element[name]
|
||||
|
||||
def process_element(element: element.PageElement) -> bool:
|
||||
try:
|
||||
if isinstance(element, NavigableString):
|
||||
@@ -190,8 +206,39 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
else:
|
||||
links['internal'].append(link_data)
|
||||
keep_element = True
|
||||
|
||||
if kwargs.get('exclude_external_links', True):
|
||||
href_url_base = href.split('/')[2]
|
||||
if url_base not in href_url_base:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# Check if we should exclude links to all major social media platforms
|
||||
if not kwargs.get('exclude_external_links', False) and kwargs.get('exclude_social_media_links', True):
|
||||
social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('social_media_domains', [])
|
||||
social_media_domains = list(set(social_media_domains))
|
||||
if any(domain in href for domain in social_media_domains):
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
elif element.name == 'img':
|
||||
# Check flag if we should remove external images
|
||||
if kwargs.get('exclude_external_images', False):
|
||||
src = element.get('src', '')
|
||||
src_url_base = src.split('/')[2]
|
||||
url_base = url.split('/')[2]
|
||||
if url_base not in src_url_base:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', True):
|
||||
src = element.get('src', '')
|
||||
src_url_base = src.split('/')[2]
|
||||
url_base = url.split('/')[2]
|
||||
if any(domain in src for domain in SOCIAL_MEDIA_DOMAINS):
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
return True # Always keep image elements
|
||||
|
||||
elif element.name in ['video', 'audio']:
|
||||
@@ -211,14 +258,17 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
})
|
||||
return True # Always keep video and audio elements
|
||||
|
||||
if element.name != 'pre':
|
||||
if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
|
||||
if kwargs.get('only_text', False):
|
||||
element.replace_with(element.get_text())
|
||||
else:
|
||||
element.unwrap()
|
||||
elif element.name != 'img':
|
||||
element.attrs = {}
|
||||
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
|
||||
if kwargs.get('only_text', False):
|
||||
element.replace_with(element.get_text())
|
||||
|
||||
remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
|
||||
# for attr in element.attrs:
|
||||
# if attr not in IMPORTANT_ATTRS or (attr.startswith('data-') and not kwargs.get('keep_data_attributes', False)):
|
||||
# del element[attr]
|
||||
|
||||
# Print element name and attributes
|
||||
print(element.name, element.attrs)
|
||||
|
||||
# Process children
|
||||
for child in list(element.children):
|
||||
@@ -254,7 +304,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
process_element(body)
|
||||
|
||||
# # Process images using ThreadPoolExecutor
|
||||
|
||||
|
||||
|
||||
imgs = body.find_all('img')
|
||||
|
||||
with ThreadPoolExecutor() as executor:
|
||||
image_results = list(executor.map(process_image, imgs, [url]*len(imgs), range(len(imgs)), [len(imgs)]*len(imgs)))
|
||||
media['images'] = [result for result in image_results if result is not None]
|
||||
@@ -307,10 +361,9 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
||||
|
||||
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
|
||||
|
||||
h = CustomHTML2Text()
|
||||
h.ignore_links = not kwargs.get('include_links_on_markdown', False)
|
||||
h.body_width = 0
|
||||
try:
|
||||
h = CustomHTML2Text()
|
||||
h.update_params(**kwargs.get('html2text', {}))
|
||||
markdown = h.handle(cleaned_html)
|
||||
except Exception as e:
|
||||
markdown = h.handle(sanitize_html(cleaned_html))
|
||||
|
||||
@@ -68,7 +68,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
"""
|
||||
super().__init__()
|
||||
self.provider = provider
|
||||
self.api_token = api_token or PROVIDER_MODELS.get(provider, None) or os.getenv("OPENAI_API_KEY")
|
||||
self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY")
|
||||
self.instruction = instruction
|
||||
self.extract_type = extraction_type
|
||||
self.schema = schema
|
||||
|
||||
1015
crawl4ai/html2text/__init__.py
Normal file
1015
crawl4ai/html2text/__init__.py
Normal file
File diff suppressed because it is too large
Load Diff
3
crawl4ai/html2text/__main__.py
Normal file
3
crawl4ai/html2text/__main__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .cli import main
|
||||
|
||||
main()
|
||||
2
crawl4ai/html2text/_typing.py
Normal file
2
crawl4ai/html2text/_typing.py
Normal file
@@ -0,0 +1,2 @@
|
||||
class OutCallback:
    """Typing stub for an output sink: a callable that accepts one string."""

    def __call__(self, s: str) -> None:
        """Receive the text chunk *s*; the stub itself does nothing."""
        return None
|
||||
330
crawl4ai/html2text/cli.py
Normal file
330
crawl4ai/html2text/cli.py
Normal file
@@ -0,0 +1,330 @@
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from . import HTML2Text, __version__, config
|
||||
|
||||
|
||||
def main() -> None:
|
||||
baseurl = ""
|
||||
|
||||
class bcolors:
|
||||
HEADER = "\033[95m"
|
||||
OKBLUE = "\033[94m"
|
||||
OKGREEN = "\033[92m"
|
||||
WARNING = "\033[93m"
|
||||
FAIL = "\033[91m"
|
||||
ENDC = "\033[0m"
|
||||
BOLD = "\033[1m"
|
||||
UNDERLINE = "\033[4m"
|
||||
|
||||
p = argparse.ArgumentParser()
|
||||
p.add_argument(
|
||||
"--default-image-alt",
|
||||
dest="default_image_alt",
|
||||
default=config.DEFAULT_IMAGE_ALT,
|
||||
help="The default alt string for images with missing ones",
|
||||
)
|
||||
p.add_argument(
|
||||
"--pad-tables",
|
||||
dest="pad_tables",
|
||||
action="store_true",
|
||||
default=config.PAD_TABLES,
|
||||
help="pad the cells to equal column width in tables",
|
||||
)
|
||||
p.add_argument(
|
||||
"--no-wrap-links",
|
||||
dest="wrap_links",
|
||||
action="store_false",
|
||||
default=config.WRAP_LINKS,
|
||||
help="don't wrap links during conversion",
|
||||
)
|
||||
p.add_argument(
|
||||
"--wrap-list-items",
|
||||
dest="wrap_list_items",
|
||||
action="store_true",
|
||||
default=config.WRAP_LIST_ITEMS,
|
||||
help="wrap list items during conversion",
|
||||
)
|
||||
p.add_argument(
|
||||
"--wrap-tables",
|
||||
dest="wrap_tables",
|
||||
action="store_true",
|
||||
default=config.WRAP_TABLES,
|
||||
help="wrap tables",
|
||||
)
|
||||
p.add_argument(
|
||||
"--ignore-emphasis",
|
||||
dest="ignore_emphasis",
|
||||
action="store_true",
|
||||
default=config.IGNORE_EMPHASIS,
|
||||
help="don't include any formatting for emphasis",
|
||||
)
|
||||
p.add_argument(
|
||||
"--reference-links",
|
||||
dest="inline_links",
|
||||
action="store_false",
|
||||
default=config.INLINE_LINKS,
|
||||
help="use reference style links instead of inline links",
|
||||
)
|
||||
p.add_argument(
|
||||
"--ignore-links",
|
||||
dest="ignore_links",
|
||||
action="store_true",
|
||||
default=config.IGNORE_ANCHORS,
|
||||
help="don't include any formatting for links",
|
||||
)
|
||||
p.add_argument(
|
||||
"--ignore-mailto-links",
|
||||
action="store_true",
|
||||
dest="ignore_mailto_links",
|
||||
default=config.IGNORE_MAILTO_LINKS,
|
||||
help="don't include mailto: links",
|
||||
)
|
||||
p.add_argument(
|
||||
"--protect-links",
|
||||
dest="protect_links",
|
||||
action="store_true",
|
||||
default=config.PROTECT_LINKS,
|
||||
help="protect links from line breaks surrounding them with angle brackets",
|
||||
)
|
||||
p.add_argument(
|
||||
"--ignore-images",
|
||||
dest="ignore_images",
|
||||
action="store_true",
|
||||
default=config.IGNORE_IMAGES,
|
||||
help="don't include any formatting for images",
|
||||
)
|
||||
p.add_argument(
|
||||
"--images-as-html",
|
||||
dest="images_as_html",
|
||||
action="store_true",
|
||||
default=config.IMAGES_AS_HTML,
|
||||
help=(
|
||||
"Always write image tags as raw html; preserves `height`, `width` and "
|
||||
"`alt` if possible."
|
||||
),
|
||||
)
|
||||
p.add_argument(
|
||||
"--images-to-alt",
|
||||
dest="images_to_alt",
|
||||
action="store_true",
|
||||
default=config.IMAGES_TO_ALT,
|
||||
help="Discard image data, only keep alt text",
|
||||
)
|
||||
p.add_argument(
|
||||
"--images-with-size",
|
||||
dest="images_with_size",
|
||||
action="store_true",
|
||||
default=config.IMAGES_WITH_SIZE,
|
||||
help=(
|
||||
"Write image tags with height and width attrs as raw html to retain "
|
||||
"dimensions"
|
||||
),
|
||||
)
|
||||
p.add_argument(
|
||||
"-g",
|
||||
"--google-doc",
|
||||
action="store_true",
|
||||
dest="google_doc",
|
||||
default=False,
|
||||
help="convert an html-exported Google Document",
|
||||
)
|
||||
p.add_argument(
|
||||
"-d",
|
||||
"--dash-unordered-list",
|
||||
action="store_true",
|
||||
dest="ul_style_dash",
|
||||
default=False,
|
||||
help="use a dash rather than a star for unordered list items",
|
||||
)
|
||||
p.add_argument(
|
||||
"-e",
|
||||
"--asterisk-emphasis",
|
||||
action="store_true",
|
||||
dest="em_style_asterisk",
|
||||
default=False,
|
||||
help="use an asterisk rather than an underscore for emphasized text",
|
||||
)
|
||||
p.add_argument(
|
||||
"-b",
|
||||
"--body-width",
|
||||
dest="body_width",
|
||||
type=int,
|
||||
default=config.BODY_WIDTH,
|
||||
help="number of characters per output line, 0 for no wrap",
|
||||
)
|
||||
p.add_argument(
|
||||
"-i",
|
||||
"--google-list-indent",
|
||||
dest="list_indent",
|
||||
type=int,
|
||||
default=config.GOOGLE_LIST_INDENT,
|
||||
help="number of pixels Google indents nested lists",
|
||||
)
|
||||
p.add_argument(
|
||||
"-s",
|
||||
"--hide-strikethrough",
|
||||
action="store_true",
|
||||
dest="hide_strikethrough",
|
||||
default=False,
|
||||
help="hide strike-through text. only relevant when -g is " "specified as well",
|
||||
)
|
||||
p.add_argument(
|
||||
"--escape-all",
|
||||
action="store_true",
|
||||
dest="escape_snob",
|
||||
default=False,
|
||||
help=(
|
||||
"Escape all special characters. Output is less readable, but avoids "
|
||||
"corner case formatting issues."
|
||||
),
|
||||
)
|
||||
p.add_argument(
|
||||
"--bypass-tables",
|
||||
action="store_true",
|
||||
dest="bypass_tables",
|
||||
default=config.BYPASS_TABLES,
|
||||
help="Format tables in HTML rather than Markdown syntax.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--ignore-tables",
|
||||
action="store_true",
|
||||
dest="ignore_tables",
|
||||
default=config.IGNORE_TABLES,
|
||||
help="Ignore table-related tags (table, th, td, tr) " "while keeping rows.",
|
||||
)
|
||||
p.add_argument(
|
||||
"--single-line-break",
|
||||
action="store_true",
|
||||
dest="single_line_break",
|
||||
default=config.SINGLE_LINE_BREAK,
|
||||
help=(
|
||||
"Use a single line break after a block element rather than two line "
|
||||
"breaks. NOTE: Requires --body-width=0"
|
||||
),
|
||||
)
|
||||
p.add_argument(
|
||||
"--unicode-snob",
|
||||
action="store_true",
|
||||
dest="unicode_snob",
|
||||
default=config.UNICODE_SNOB,
|
||||
help="Use unicode throughout document",
|
||||
)
|
||||
p.add_argument(
|
||||
"--no-automatic-links",
|
||||
action="store_false",
|
||||
dest="use_automatic_links",
|
||||
default=config.USE_AUTOMATIC_LINKS,
|
||||
help="Do not use automatic links wherever applicable",
|
||||
)
|
||||
p.add_argument(
|
||||
"--no-skip-internal-links",
|
||||
action="store_false",
|
||||
dest="skip_internal_links",
|
||||
default=config.SKIP_INTERNAL_LINKS,
|
||||
help="Do not skip internal links",
|
||||
)
|
||||
p.add_argument(
|
||||
"--links-after-para",
|
||||
action="store_true",
|
||||
dest="links_each_paragraph",
|
||||
default=config.LINKS_EACH_PARAGRAPH,
|
||||
help="Put links after each paragraph instead of document",
|
||||
)
|
||||
p.add_argument(
|
||||
"--mark-code",
|
||||
action="store_true",
|
||||
dest="mark_code",
|
||||
default=config.MARK_CODE,
|
||||
help="Mark program code blocks with [code]...[/code]",
|
||||
)
|
||||
p.add_argument(
|
||||
"--decode-errors",
|
||||
dest="decode_errors",
|
||||
default=config.DECODE_ERRORS,
|
||||
help=(
|
||||
"What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
|
||||
"acceptable values"
|
||||
),
|
||||
)
|
||||
p.add_argument(
|
||||
"--open-quote",
|
||||
dest="open_quote",
|
||||
default=config.OPEN_QUOTE,
|
||||
help="The character used to open quotes",
|
||||
)
|
||||
p.add_argument(
|
||||
"--close-quote",
|
||||
dest="close_quote",
|
||||
default=config.CLOSE_QUOTE,
|
||||
help="The character used to close quotes",
|
||||
)
|
||||
p.add_argument(
|
||||
"--version", action="version", version=".".join(map(str, __version__))
|
||||
)
|
||||
p.add_argument("filename", nargs="?")
|
||||
p.add_argument("encoding", nargs="?", default="utf-8")
|
||||
p.add_argument(
|
||||
"--include-sup-sub",
|
||||
dest="include_sup_sub",
|
||||
action="store_true",
|
||||
default=config.INCLUDE_SUP_SUB,
|
||||
help="Include the sup and sub tags",
|
||||
)
|
||||
args = p.parse_args()
|
||||
|
||||
if args.filename and args.filename != "-":
|
||||
with open(args.filename, "rb") as fp:
|
||||
data = fp.read()
|
||||
else:
|
||||
data = sys.stdin.buffer.read()
|
||||
|
||||
try:
|
||||
html = data.decode(args.encoding, args.decode_errors)
|
||||
except UnicodeDecodeError as err:
|
||||
warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
|
||||
warning += " Use the " + bcolors.OKGREEN
|
||||
warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
|
||||
print(warning)
|
||||
raise err
|
||||
|
||||
h = HTML2Text(baseurl=baseurl)
|
||||
# handle options
|
||||
if args.ul_style_dash:
|
||||
h.ul_item_mark = "-"
|
||||
if args.em_style_asterisk:
|
||||
h.emphasis_mark = "*"
|
||||
h.strong_mark = "__"
|
||||
|
||||
h.body_width = args.body_width
|
||||
h.google_list_indent = args.list_indent
|
||||
h.ignore_emphasis = args.ignore_emphasis
|
||||
h.ignore_links = args.ignore_links
|
||||
h.ignore_mailto_links = args.ignore_mailto_links
|
||||
h.protect_links = args.protect_links
|
||||
h.ignore_images = args.ignore_images
|
||||
h.images_as_html = args.images_as_html
|
||||
h.images_to_alt = args.images_to_alt
|
||||
h.images_with_size = args.images_with_size
|
||||
h.google_doc = args.google_doc
|
||||
h.hide_strikethrough = args.hide_strikethrough
|
||||
h.escape_snob = args.escape_snob
|
||||
h.bypass_tables = args.bypass_tables
|
||||
h.ignore_tables = args.ignore_tables
|
||||
h.single_line_break = args.single_line_break
|
||||
h.inline_links = args.inline_links
|
||||
h.unicode_snob = args.unicode_snob
|
||||
h.use_automatic_links = args.use_automatic_links
|
||||
h.skip_internal_links = args.skip_internal_links
|
||||
h.links_each_paragraph = args.links_each_paragraph
|
||||
h.mark_code = args.mark_code
|
||||
h.wrap_links = args.wrap_links
|
||||
h.wrap_list_items = args.wrap_list_items
|
||||
h.wrap_tables = args.wrap_tables
|
||||
h.pad_tables = args.pad_tables
|
||||
h.default_image_alt = args.default_image_alt
|
||||
h.open_quote = args.open_quote
|
||||
h.close_quote = args.close_quote
|
||||
h.include_sup_sub = args.include_sup_sub
|
||||
|
||||
sys.stdout.write(h.handle(html))
|
||||
172
crawl4ai/html2text/config.py
Normal file
172
crawl4ai/html2text/config.py
Normal file
@@ -0,0 +1,172 @@
|
||||
import re
|
||||
|
||||
# Use Unicode characters instead of their ascii pseudo-replacements
|
||||
UNICODE_SNOB = False
|
||||
|
||||
# Marker to use for marking tables for padding post processing
|
||||
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
|
||||
# Escape all special characters. Output is less readable, but avoids
|
||||
# corner case formatting issues.
|
||||
ESCAPE_SNOB = False
|
||||
ESCAPE_BACKSLASH = False
|
||||
ESCAPE_DOT = False
|
||||
ESCAPE_PLUS = False
|
||||
ESCAPE_DASH = False
|
||||
|
||||
# Put the links after each paragraph instead of at the end.
|
||||
LINKS_EACH_PARAGRAPH = False
|
||||
|
||||
# Wrap long lines at position. 0 for no wrapping.
|
||||
BODY_WIDTH = 78
|
||||
|
||||
# Don't show internal links (href="#local-anchor") -- corresponding link
|
||||
# targets won't be visible in the plain text file anyway.
|
||||
SKIP_INTERNAL_LINKS = True
|
||||
|
||||
# Use inline, rather than reference, formatting for images and links
|
||||
INLINE_LINKS = True
|
||||
|
||||
# Protect links from line breaks surrounding them with angle brackets (in
|
||||
# addition to their square brackets)
|
||||
PROTECT_LINKS = False
|
||||
# WRAP_LINKS = True
|
||||
WRAP_LINKS = True
|
||||
|
||||
# Wrap list items.
|
||||
WRAP_LIST_ITEMS = False
|
||||
|
||||
# Wrap tables
|
||||
WRAP_TABLES = False
|
||||
|
||||
# Number of pixels Google indents nested lists
|
||||
GOOGLE_LIST_INDENT = 36
|
||||
|
||||
# Values Google and others may use to indicate bold text
|
||||
BOLD_TEXT_STYLE_VALUES = ("bold", "700", "800", "900")
|
||||
|
||||
IGNORE_ANCHORS = False
|
||||
IGNORE_MAILTO_LINKS = False
|
||||
IGNORE_IMAGES = False
|
||||
IMAGES_AS_HTML = False
|
||||
IMAGES_TO_ALT = False
|
||||
IMAGES_WITH_SIZE = False
|
||||
IGNORE_EMPHASIS = False
|
||||
MARK_CODE = False
|
||||
DECODE_ERRORS = "strict"
|
||||
DEFAULT_IMAGE_ALT = ""
|
||||
PAD_TABLES = False
|
||||
|
||||
# Convert links with same href and text to <href> format
|
||||
# if they are absolute links
|
||||
USE_AUTOMATIC_LINKS = True
|
||||
|
||||
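# For checking space-only lines on line 771. NOTE(review): as written, the pattern below matches one whitespace character followed by a literal "+", not a run of whitespace (r"\s+") -- confirm which is intended.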
|
||||
RE_SPACE = re.compile(r"\s\+")
|
||||
|
||||
RE_ORDERED_LIST_MATCHER = re.compile(r"\d+\.\s")
|
||||
RE_UNORDERED_LIST_MATCHER = re.compile(r"[-\*\+]\s")
|
||||
RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
|
||||
RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
|
||||
|
||||
# to find links in the text
|
||||
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
|
||||
|
||||
# to find table separators
|
||||
RE_TABLE = re.compile(r" \| ")
|
||||
|
||||
RE_MD_DOT_MATCHER = re.compile(
|
||||
r"""
|
||||
^ # start of line
|
||||
(\s*\d+) # optional whitespace and a number
|
||||
(\.) # dot
|
||||
(?=\s) # lookahead assert whitespace
|
||||
""",
|
||||
re.MULTILINE | re.VERBOSE,
|
||||
)
|
||||
RE_MD_PLUS_MATCHER = re.compile(
|
||||
r"""
|
||||
^
|
||||
(\s*)
|
||||
(\+)
|
||||
(?=\s)
|
||||
""",
|
||||
flags=re.MULTILINE | re.VERBOSE,
|
||||
)
|
||||
RE_MD_DASH_MATCHER = re.compile(
|
||||
r"""
|
||||
^
|
||||
(\s*)
|
||||
(-)
|
||||
(?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
|
||||
# or another dash (header or hr)
|
||||
""",
|
||||
flags=re.MULTILINE | re.VERBOSE,
|
||||
)
|
||||
RE_SLASH_CHARS = r"\`*_{}[]()#+-.!"
|
||||
RE_MD_BACKSLASH_MATCHER = re.compile(
|
||||
r"""
|
||||
(\\) # match one slash
|
||||
(?=[%s]) # followed by a char that requires escaping
|
||||
"""
|
||||
% re.escape(RE_SLASH_CHARS),
|
||||
flags=re.VERBOSE,
|
||||
)
|
||||
|
||||
UNIFIABLE = {
|
||||
"rsquo": "'",
|
||||
"lsquo": "'",
|
||||
"rdquo": '"',
|
||||
"ldquo": '"',
|
||||
"copy": "(C)",
|
||||
"mdash": "--",
|
||||
"nbsp": " ",
|
||||
"rarr": "->",
|
||||
"larr": "<-",
|
||||
"middot": "*",
|
||||
"ndash": "-",
|
||||
"oelig": "oe",
|
||||
"aelig": "ae",
|
||||
"agrave": "a",
|
||||
"aacute": "a",
|
||||
"acirc": "a",
|
||||
"atilde": "a",
|
||||
"auml": "a",
|
||||
"aring": "a",
|
||||
"egrave": "e",
|
||||
"eacute": "e",
|
||||
"ecirc": "e",
|
||||
"euml": "e",
|
||||
"igrave": "i",
|
||||
"iacute": "i",
|
||||
"icirc": "i",
|
||||
"iuml": "i",
|
||||
"ograve": "o",
|
||||
"oacute": "o",
|
||||
"ocirc": "o",
|
||||
"otilde": "o",
|
||||
"ouml": "o",
|
||||
"ugrave": "u",
|
||||
"uacute": "u",
|
||||
"ucirc": "u",
|
||||
"uuml": "u",
|
||||
"lrm": "",
|
||||
"rlm": "",
|
||||
}
|
||||
|
||||
# Format tables in HTML rather than Markdown syntax
|
||||
BYPASS_TABLES = False
|
||||
# Ignore table-related tags (table, th, td, tr) while keeping rows
|
||||
IGNORE_TABLES = False
|
||||
|
||||
|
||||
# Use a single line break after a block element rather than two line breaks.
|
||||
# NOTE: Requires body width setting to be 0.
|
||||
SINGLE_LINE_BREAK = False
|
||||
|
||||
|
||||
# Use double quotation marks when converting the <q> tag.
|
||||
OPEN_QUOTE = '"'
|
||||
CLOSE_QUOTE = '"'
|
||||
|
||||
# Include the <sup> and <sub> tags
|
||||
INCLUDE_SUP_SUB = False
|
||||
18
crawl4ai/html2text/elements.py
Normal file
18
crawl4ai/html2text/elements.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from typing import Dict, Optional
|
||||
|
||||
|
||||
class AnchorElement:
    """Slotted record holding an anchor's attributes and two counters."""

    __slots__ = ["attrs", "count", "outcount"]

    def __init__(self, attrs: Dict[str, Optional[str]], count: int, outcount: int):
        """Store *attrs*, *count* and *outcount* unchanged on the instance."""
        self.attrs, self.count, self.outcount = attrs, count, outcount
|
||||
|
||||
|
||||
class ListElement:
    """Slotted record pairing a list's tag name with a running number."""

    __slots__ = ["name", "num"]

    def __init__(self, name: str, num: int):
        """Store *name* and *num* unchanged on the instance."""
        self.name, self.num = name, num
|
||||
303
crawl4ai/html2text/utils.py
Normal file
303
crawl4ai/html2text/utils.py
Normal file
@@ -0,0 +1,303 @@
|
||||
import html.entities
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from . import config
|
||||
|
||||
# Map the code point of each entity named in config.UNIFIABLE to its
# replacement string; the "nbsp" entry is deliberately left out.
unifiable_n = dict(
    (html.entities.name2codepoint[entity], replacement)
    for entity, replacement in config.UNIFIABLE.items()
    if entity != "nbsp"
)
|
||||
|
||||
|
||||
def hn(tag: str) -> int:
    """
    Return the heading level of ``tag``, or 0 when it is not a heading.

    A heading tag is exactly two characters: ``h`` followed by a digit
    from 1 to 9 (``"h1"`` gives 1). Anything else, including the empty
    string and ``"h0"``, gives 0.

    :type tag: str

    :rtype: int
    """
    # Test the length first so an empty tag name cannot raise IndexError.
    if len(tag) == 2 and tag[0] == "h":
        level = tag[1]
        if "0" < level <= "9":
            return int(level)
    return 0
|
||||
|
||||
|
||||
def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    Parse a ``;``-separated CSS declaration string.

    Pieces without a ``:`` are ignored; names and values are stripped and
    lower-cased. A later declaration of the same name overrides an
    earlier one.

    :returns: A hash of css attributes
    """
    properties: Dict[str, str] = {}
    for declaration in style.split(";"):
        name, colon, value = declaration.partition(":")
        if not colon:
            continue
        properties[name.strip().lower()] = value.strip().lower()
    return properties
|
||||
|
||||
|
||||
def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    Parse a stylesheet into per-selector property dictionaries.

    ``@import`` statements are removed first. The remaining text is split
    on ``}`` and each piece on ``{``; if any piece does not split into
    exactly a selector and a body, the whole result is empty.

    :type data: str

    :returns: A hash of css selectors, each of which contains a hash of
    css attributes.
    :rtype: dict
    """
    # The trailing ";" guarantees every "@import" has a terminator to cut at.
    data += ";"
    while True:
        start = data.find("@import")
        if start == -1:
            break
        end = data.find(";", start)
        data = data[:start] + data[end + 1:]

    rules: Dict[str, Dict[str, str]] = {}
    try:
        for chunk in data.split("}"):
            if "{" not in chunk.strip():
                continue
            selector, body = chunk.split("{")
            rules[selector.strip()] = dumb_property_dict(body)
    except ValueError:
        # A chunk with more than one "{" cannot be unpacked into a pair;
        # the styles are best-effort, so give up on the whole sheet.
        rules = {}  # not that important

    return rules
|
||||
|
||||
|
||||
def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """
    Compute the effective style of an element.

    Starts from a copy of ``parent_style``, overlays the ``style_def``
    entry for every class named in ``attrs["class"]`` (looked up as
    ``"." + class``), then overlays the inline ``attrs["style"]``
    declarations. ``parent_style`` itself is not modified.

    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict

    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    style = parent_style.copy()

    # Attribute values are Optional: a present-but-valueless "class" or
    # "style" is None. Treat it like an empty value instead of asserting,
    # so one odd attribute cannot abort the whole conversion.
    css_classes = attrs.get("class")
    if css_classes:
        for css_class in css_classes.split():
            style.update(style_def.get("." + css_class, {}))

    inline_style = attrs.get("style")
    if inline_style:
        style.update(dumb_property_dict(inline_style))

    return style
|
||||
|
||||
|
||||
def google_list_style(style: Dict[str, str]) -> str:
    """
    Classify a list as unordered (``"ul"``) or ordered (``"ol"``).

    The list is unordered only when ``list-style-type`` is present and is
    one of ``disc``, ``circle``, ``square`` or ``none``.

    :type style: dict

    :rtype: str
    """
    bullet_styles = ("disc", "circle", "square", "none")
    if "list-style-type" in style and style["list-style-type"] in bullet_styles:
        return "ul"
    return "ol"
|
||||
|
||||
|
||||
def google_has_height(style: Dict[str, str]) -> bool:
    """
    Report whether ``style`` explicitly defines the ``height`` property.

    :type style: dict

    :rtype: bool
    """
    has_height = "height" in style
    return has_height
|
||||
|
||||
|
||||
def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    Collect the values of the emphasis-related properties in ``style``.

    The properties are read in a fixed order: ``text-decoration``,
    ``font-style``, ``font-weight``; missing ones are skipped.

    :type style: dict

    :returns: A list of all emphasis modifiers of the element
    :rtype: list
    """
    emphasis_properties = ("text-decoration", "font-style", "font-weight")
    return [style[prop] for prop in emphasis_properties if prop in style]
|
||||
|
||||
|
||||
def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Check whether ``style`` selects a fixed width font.

    Only an exact ``font-family`` of ``courier new`` or ``consolas``
    counts.

    :type style: dict

    :rtype: bool
    """
    font_family = style["font-family"] if "font-family" in style else ""
    return font_family in ("courier new", "consolas")
|
||||
|
||||
|
||||
def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract numbering from list element attributes

    Returns the ``start`` attribute minus one. The result is 0 when the
    attribute is absent, has no value (``None``), or is not an integer.

    :type attrs: dict

    :rtype: int
    """
    start = attrs.get("start")
    # Attribute values are Optional; a valueless "start" is treated as
    # missing instead of tripping an assertion.
    if start is None:
        return 0
    try:
        return int(start) - 1
    except ValueError:
        return 0
|
||||
|
||||
|
||||
def skipwrap(
    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
    """
    Decide whether a paragraph must be left unwrapped.

    :param para: the paragraph text to inspect
    :param wrap_links: when False, paragraphs matching ``config.RE_LINK``
        are never wrapped
    :param wrap_list_items: when False, paragraphs starting with ``-`` or
        ``*`` (but not ``**``) are never wrapped
    :param wrap_tables: when False, paragraphs matching ``config.RE_TABLE``
        are never wrapped

    :returns: True when the paragraph should be skipped by the wrapper
    :rtype: bool
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True

    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap. startswith() is also safe on an empty string, where
    # indexing para[0] would raise IndexError.
    if para.startswith(("    ", "\t")):
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
        return not wrap_list_items

    # If text contains a pipe character it is likely a table
    if not wrap_tables and config.RE_TABLE.search(para):
        return True

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally preceded by whitespace), it's a list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )
|
||||
|
||||
|
||||
def escape_md(text: str) -> str:
    """
    Backslash-escape markdown-sensitive characters that occur inside
    other markdown constructs.

    Every match of ``config.RE_MD_CHARS_MATCHER`` is replaced by a
    backslash followed by the matcher's first group.
    """
    matcher = config.RE_MD_CHARS_MATCHER
    escaped = matcher.sub(r"\\\1", text)
    return escaped
|
||||
|
||||
|
||||
def escape_md_section(
    text: str,
    escape_backslash: bool = True,
    snob: bool = False,
    escape_dot: bool = True,
    escape_plus: bool = True,
    escape_dash: bool = True
) -> str:
    """
    Escape markdown-sensitive characters across a whole document section.

    Each pass is switched on or off by its own flag, and the enabled passes
    run in a fixed order: backslash, snob (all characters), dot, plus, dash.
    """
    # (enabled, name of the compiled matcher on config, replacement template)
    passes = (
        (escape_backslash, "RE_MD_BACKSLASH_MATCHER", r"\\\1"),
        (snob, "RE_MD_CHARS_MATCHER_ALL", r"\\\1"),
        (escape_dot, "RE_MD_DOT_MATCHER", r"\1\\\2"),
        (escape_plus, "RE_MD_PLUS_MATCHER", r"\1\\\2"),
        (escape_dash, "RE_MD_DASH_MATCHER", r"\1\\\2"),
    )
    for enabled, matcher_name, template in passes:
        if enabled:
            # Looked up lazily so a disabled pass never touches config.
            text = getattr(config, matcher_name).sub(template, text)
    return text
|
||||
|
||||
def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Pad the cells of a pipe-delimited table so its columns line up.

    :param lines: the raw table rows, cells separated by ``|``
    :param right_margin: number of filler characters added to the right of
        the widest cell of each column

    :returns: the re-padded rows; a row made only of ``-`` and ``|`` is
        treated as the header separator and padded with ``-`` instead of
        spaces. An empty input yields an empty list.
    :rtype: list
    """
    if not lines:
        # Nothing to format; lines[0] below would raise IndexError.
        return []

    # find the maximum width of the columns
    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
    max_cols = len(max_width)
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        num_cols = len(cols)

        # don't drop any data if colspan attributes result in unequal lengths
        if num_cols < max_cols:
            cols += [""] * (max_cols - num_cols)
        elif max_cols < num_cols:
            max_width += [
                len(x) + right_margin for x in cols[-(num_cols - max_cols):]
            ]
            max_cols = num_cols

        max_width = [
            max(len(x) + right_margin, old_len)
            for x, old_len in zip(cols, max_width)
        ]

    # reformat
    new_lines = []
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        # A row consisting solely of "-" and "|" is the header separator.
        if set(line.strip()) == set("-|"):
            filler, prefix = "-", "|-"
        else:
            filler, prefix = " ", "| "
        # Cells are already right-stripped; ljust pads up to the column width
        # and leaves over-long cells untouched.
        new_cols = [x.ljust(width, filler) for x, width in zip(cols, max_width)]
        new_lines.append(prefix + "|".join(new_cols) + "|")
    return new_lines
|
||||
|
||||
|
||||
def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """
    Provide padding for tables in the text.

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` toggle "inside table"
    state and are themselves removed. Lines between an opening and a closing
    marker are re-padded with ``reformat_table`` and followed by one empty
    line; all other lines pass through unchanged.

    :param text: the full text, lines separated by ``\\n``
    :param right_margin: margin forwarded to ``reformat_table``

    :returns: the text with its marked tables padded
    :rtype: str
    """
    lines = text.split("\n")
    table_buffer: List[str] = []
    table_started = False
    new_lines: List[str] = []
    for line in lines:
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # Two adjacent markers enclose no rows; skip formatting
                # rather than handing reformat_table an empty list.
                if table_buffer:
                    new_lines.extend(reformat_table(table_buffer, right_margin))
                table_buffer = []
                new_lines.append("")
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)

    # An opening marker without a matching closing marker leaves lines in the
    # buffer. Emit them unformatted instead of silently dropping them; the
    # buffer is always empty here when the markers were balanced.
    new_lines.extend(table_buffer)
    return "\n".join(new_lines)
|
||||
@@ -1,13 +1,12 @@
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||
import html2text
|
||||
import json
|
||||
import html
|
||||
import re
|
||||
import os
|
||||
import platform
|
||||
from html2text import HTML2Text
|
||||
from .html2text import HTML2Text
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS
|
||||
from .config import *
|
||||
from pathlib import Path
|
||||
@@ -182,9 +181,22 @@ def escape_json_string(s):
|
||||
class CustomHTML2Text(HTML2Text):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.ignore_links = True
|
||||
self.inside_pre = False
|
||||
self.inside_code = False
|
||||
|
||||
self.skip_internal_links = False
|
||||
self.single_line_break = False
|
||||
self.mark_code = False
|
||||
self.include_sup_sub = False
|
||||
self.body_width = 0
|
||||
self.ignore_mailto_links = True
|
||||
self.ignore_links = False
|
||||
self.escape_backslash = False
|
||||
self.escape_dot = False
|
||||
self.escape_plus = False
|
||||
self.escape_dash = False
|
||||
self.escape_snob = False
|
||||
|
||||
|
||||
def handle_tag(self, tag, attrs, start):
|
||||
if tag == 'pre':
|
||||
@@ -194,6 +206,10 @@ class CustomHTML2Text(HTML2Text):
|
||||
else:
|
||||
self.o('\n```')
|
||||
self.inside_pre = False
|
||||
elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||
pass
|
||||
|
||||
|
||||
# elif tag == 'code' and not self.inside_pre:
|
||||
# if start:
|
||||
# if not self.inside_pre:
|
||||
|
||||
Reference in New Issue
Block a user