2025 feb alpha 1 (#685)

* spelling change in prompt

* gpt-4o-mini support

* Remove leading Y before here

* prompt spell correction

* (Docs) Fix numbered list end-of-line formatting

Added the missing "two spaces" to add a line break

* fix: access downloads_path through browser_config in _handle_download method - Fixes #585

* crawl

* fix: https://github.com/unclecode/crawl4ai/issues/592

* fix: https://github.com/unclecode/crawl4ai/issues/583

* Docs update: https://github.com/unclecode/crawl4ai/issues/649

* fix: https://github.com/unclecode/crawl4ai/issues/570

* Docs: updated example for content-selection to reflect new changes in yc newsfeed css

* Refactor: Removed old filters and replaced with optimised filters

* fix:Fixed imports as per the new names of filters

* Tests: For deep crawl filters

* Refactor: Remove old scorers and replace with optimised ones: Fix imports forall filters and scorers.

* fix: awaiting on filters that are async in nature eg: content relevance and seo filters

* fix: https://github.com/unclecode/crawl4ai/issues/592

* fix: https://github.com/unclecode/crawl4ai/issues/715

---------

Co-authored-by: DarshanTank <darshan.tank@gnani.ai>
Co-authored-by: Tuhin Mallick <tuhin.mllk@gmail.com>
Co-authored-by: Serhat Soydan <ssoydan@gmail.com>
Co-authored-by: cardit1 <maneesh@cardit.in>
Co-authored-by: Tautik Agrahari <tautikagrahari@gmail.com>
This commit is contained in:
Aravind
2025-02-19 11:43:17 +05:30
committed by GitHub
parent c171891999
commit dad592c801
19 changed files with 833 additions and 1350 deletions

View File

@@ -510,6 +510,7 @@ class HTML2Text(html.parser.HTMLParser):
if tag == "a" and not self.ignore_links:
if start:
self.inside_link = True
if (
"href" in attrs
and attrs["href"] is not None
@@ -526,6 +527,7 @@ class HTML2Text(html.parser.HTMLParser):
else:
self.astack.append(None)
else:
self.inside_link = False
if self.astack:
a = self.astack.pop()
if self.maybe_automatic_link and not self.empty_link:
@@ -610,13 +612,22 @@ class HTML2Text(html.parser.HTMLParser):
self.o("[" + str(a_props.count) + "]")
if tag == "dl" and start:
self.p()
if tag == "dt" and not start:
self.pbr()
if tag == "dd" and start:
self.o(" ")
if tag == "dd" and not start:
self.pbr()
self.p() # Add paragraph break before list starts
self.p_p = 0 # Reset paragraph state
elif tag == "dt" and start:
if self.p_p == 0: # If not first term
self.o("\n\n") # Add spacing before new term-definition pair
self.p_p = 0 # Reset paragraph state
elif tag == "dt" and not start:
self.o("\n") # Single newline between term and definition
elif tag == "dd" and start:
self.o(" ") # Indent definition
elif tag == "dd" and not start:
self.p_p = 0
if tag in ["ol", "ul"]:
# Google Docs create sub lists as top level lists
@@ -1026,6 +1037,7 @@ class CustomHTML2Text(HTML2Text):
super().__init__(*args, **kwargs)
self.inside_pre = False
self.inside_code = False
self.inside_link = False
self.preserve_tags = set() # Set of tags to preserve
self.current_preserved_tag = None
self.preserved_content = []
@@ -1105,11 +1117,17 @@ class CustomHTML2Text(HTML2Text):
# Ignore code tags inside pre blocks if handle_code_in_pre is False
return
if start:
self.o("`") # Markdown inline code start
if not self.inside_link:
self.o("`") # Only output backtick if not inside a link
self.inside_code = True
else:
self.o("`") # Markdown inline code end
if not self.inside_link:
self.o("`") # Only output backtick if not inside a link
self.inside_code = False
# If inside a link, let the parent class handle the content
if self.inside_link:
super().handle_tag(tag, attrs, start)
else:
super().handle_tag(tag, attrs, start)