2025 feb alpha 1 (#685)

* spelling change in prompt * gpt-4o-mini support * Remove leading Y before here * prompt spell correction * (Docs) Fix numbered list end-of-line formatting: added the missing "two spaces" to add a line break * fix: access downloads_path through browser_config in _handle_download method - Fixes #585 * crawl * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/583 * Docs update: https://github.com/unclecode/crawl4ai/issues/649 * fix: https://github.com/unclecode/crawl4ai/issues/570 * Docs: updated example for content-selection to reflect new changes in yc newsfeed css * Refactor: Removed old filters and replaced with optimised filters * fix: Fixed imports as per the new names of filters * Tests: For deep crawl filters * Refactor: Remove old scorers and replace with optimised ones: Fix imports for all filters and scorers. * fix: awaiting on filters that are async in nature, e.g. content relevance and seo filters * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/715 --------- Co-authored-by: DarshanTank <darshan.tank@gnani.ai> Co-authored-by: Tuhin Mallick <tuhin.mllk@gmail.com> Co-authored-by: Serhat Soydan <ssoydan@gmail.com> Co-authored-by: cardit1 <maneesh@cardit.in> Co-authored-by: Tautik Agrahari <tautikagrahari@gmail.com>
2025-02-19 11:43:17 +05:30
parent c171891999
commit dad592c801
19 changed files with 833 additions and 1350 deletions
--- a/crawl4ai/html2text/init.py
+++ b/crawl4ai/html2text/init.py
@@ -510,6 +510,7 @@ class HTML2Text(html.parser.HTMLParser):

        if tag == "a" and not self.ignore_links:
            if start:
+                self.inside_link = True
                if (
                    "href" in attrs
                    and attrs["href"] is not None
@@ -526,6 +527,7 @@ class HTML2Text(html.parser.HTMLParser):
                else:
                    self.astack.append(None)
            else:
+                self.inside_link = False
                if self.astack:
                    a = self.astack.pop()
                    if self.maybe_automatic_link and not self.empty_link:
@@ -610,13 +612,22 @@ class HTML2Text(html.parser.HTMLParser):
                        self.o("[" + str(a_props.count) + "]")

        if tag == "dl" and start:
-            self.p()
-        if tag == "dt" and not start:
-            self.pbr()
-        if tag == "dd" and start:
-            self.o("    ")
-        if tag == "dd" and not start:
-            self.pbr()
+            self.p()  # Add paragraph break before list starts
+            self.p_p = 0  # Reset paragraph state
+        
+        elif tag == "dt" and start:
+            if self.p_p == 0:  # If not first term
+                self.o("\n\n")  # Add spacing before new term-definition pair
+            self.p_p = 0  # Reset paragraph state
+        
+        elif tag == "dt" and not start:
+            self.o("\n")  # Single newline between term and definition
+        
+        elif tag == "dd" and start:
+            self.o("    ")  # Indent definition
+        
+        elif tag == "dd" and not start:
+            self.p_p = 0

        if tag in ["ol", "ul"]:
            # Google Docs create sub lists as top level lists
@@ -1026,6 +1037,7 @@ class CustomHTML2Text(HTML2Text):
        super().__init__(*args, **kwargs)
        self.inside_pre = False
        self.inside_code = False
+        self.inside_link = False
        self.preserve_tags = set()  # Set of tags to preserve
        self.current_preserved_tag = None
        self.preserved_content = []
@@ -1105,11 +1117,17 @@ class CustomHTML2Text(HTML2Text):
                # Ignore code tags inside pre blocks if handle_code_in_pre is False
                return
            if start:
-                self.o("`")  # Markdown inline code start
+                if not self.inside_link:
+                    self.o("`")  # Only output backtick if not inside a link
                self.inside_code = True
            else:
-                self.o("`")  # Markdown inline code end
+                if not self.inside_link:
+                    self.o("`")  # Only output backtick if not inside a link
                self.inside_code = False
+
+            # If inside a link, let the parent class handle the content
+            if self.inside_link:
+                super().handle_tag(tag, attrs, start) 
        else:
            super().handle_tag(tag, attrs, start)