refactor(deep-crawling): reorganize deep crawling strategies and add new implementations

Split deep crawling code into separate strategy files for better organization and maintainability. Added new BFF (best-first) and DFS crawling strategies, and introduced a base strategy class and common types.

BREAKING CHANGE: Deep crawling implementation has been split into multiple files. Import paths for deep crawling strategies have changed.
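Illustrative impact on imports (a hedged sketch; the per-strategy module and class names below are assumptions inferred from the commit message, not taken from the diff):

    # before (hypothetical old layout): strategies lived in one module
    # from crawl4ai.deep_crawling import BFSDeepCrawlStrategy

    # after the split (assumed layout): one file per strategy under the deep_crawling package
    from crawl4ai.deep_crawling.bfs_strategy import BFSDeepCrawlStrategy
    from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy        # new
    from crawl4ai.deep_crawling.bff_strategy import BestFirstCrawlingStrategy   # new "BFF"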
Author: UncleCode
Date:   2025-02-05 22:50:39 +08:00
Parent: c308a794e8
Commit: a9415aaaf6

10 changed files with 769 additions and 214 deletions


@@ -1,22 +1,25 @@
-from ast import Call
 import time
 from urllib.parse import urlparse
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
 import json
 import html
 import lxml
 import re
 import os
 import platform
 from .prompts import PROMPT_EXTRACT_BLOCKS
-from array import array
-from .config import *
 from .html2text import html2text, CustomHTML2Text
+# from .config import *
+from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
+import httpx
+from socket import gaierror
 from pathlib import Path
-from typing import Dict, Any, List, Tuple, Union, Optional, Callable
+from typing import Dict, Any, List, Optional, Callable
 from urllib.parse import urljoin
 import requests
 from requests.exceptions import InvalidSchema
-from typing import Dict, Any
 import xxhash
 from colorama import Fore, Style, init
 import textwrap
@@ -24,20 +27,20 @@ import cProfile
 import pstats
 from functools import wraps
 import asyncio
 from lxml import html, etree
 import sqlite3
 import hashlib
 from urllib.parse import urljoin, urlparse
 from urllib.robotparser import RobotFileParser
 import aiohttp
 from pathlib import Path
 from packaging import version
 from . import __version__
-from typing import Sequence, List
-from array import array
+from typing import Sequence
 from itertools import chain
 from collections import deque
-from typing import Callable, Generator, Iterable, List, Optional
+from typing import Generator, Iterable
 
 def chunk_documents(
     documents: Iterable[str],
@@ -194,7 +197,7 @@ class VersionManager:
             return None
         try:
             return version.parse(self.version_file.read_text().strip())
-        except:
+        except Exception as _ex:
             return None
 
     def update_version(self):
@@ -286,7 +289,7 @@ class RobotsParser:
             domain = parsed.netloc
             if not domain:
                 return True
-        except:
+        except Exception as _ex:
            return True
 
        # Fast path - check cache first
@@ -306,7 +309,7 @@ class RobotsParser:
                     self._cache_rules(domain, rules)
                 else:
                     return True
-        except:
+        except Exception as _ex:
             # On any error (timeout, connection failed, etc), allow access
             return True
@@ -1374,7 +1377,7 @@ def get_content_of_website_optimized(
                 src = img.get("src", "")
                 if base64_pattern.match(src):
                     img["src"] = base64_pattern.sub("", src)
-            except:
+            except Exception as _ex:
                 pass
 
     cleaned_html = str(body).replace("\n\n", "\n").replace("  ", " ")
@@ -1412,7 +1415,7 @@ def extract_metadata_using_lxml(html, doc=None):
     if doc is None:
         try:
-            doc = lhtml.document_fromstring(html)
+            doc = lxml.html.document_fromstring(html)
         except Exception:
             return {}
@@ -1728,10 +1731,10 @@ def extract_blocks_batch(batch_data, provider="groq/llama3-70b-8192", api_token=
     messages = []
 
-    for url, html in batch_data:
+    for url, _html in batch_data:
         variable_values = {
             "URL": url,
-            "HTML": html,
+            "HTML": _html,
         }
 
         prompt_with_variables = PROMPT_EXTRACT_BLOCKS
@@ -1913,7 +1916,7 @@ def fast_format_html(html_string):
     indent = 0
     indent_str = "  "  # Two spaces for indentation
     formatted = []
-    in_content = False
+    # in_content = False
 
     # Split by < and > to separate tags and content
     parts = html_string.replace(">", ">\n").replace("<", "\n<").split("\n")
@@ -2466,17 +2469,74 @@ def truncate(value, threshold):
 def optimize_html(html_str, threshold=200):
     root = html.fromstring(html_str)
 
-    for element in root.iter():
+    for _element in root.iter():
         # Process attributes
-        for attr in list(element.attrib):
-            element.attrib[attr] = truncate(element.attrib[attr], threshold)
+        for attr in list(_element.attrib):
+            _element.attrib[attr] = truncate(_element.attrib[attr], threshold)
 
         # Process text content
-        if element.text and len(element.text) > threshold:
-            element.text = truncate(element.text, threshold)
+        if _element.text and len(_element.text) > threshold:
+            _element.text = truncate(_element.text, threshold)
 
         # Process tail text
-        if element.tail and len(element.tail) > threshold:
-            element.tail = truncate(element.tail, threshold)
+        if _element.tail and len(_element.tail) > threshold:
+            _element.tail = truncate(_element.tail, threshold)
 
-    return html.tostring(root, encoding='unicode', pretty_print=False)
+    return html.tostring(root, encoding='unicode', pretty_print=False)
+
+
+class HeadPeekr:
+    @staticmethod
+    async def fetch_head_section(url, timeout=0.3):
+        headers = {
+            "User-Agent": "Mozilla/5.0 (compatible; CrawlBot/1.0)",
+            "Accept": "text/html",
+            "Connection": "close"  # Force close after response
+        }
+        try:
+            async with httpx.AsyncClient(timeout=timeout) as client:
+                response = await client.get(url, headers=headers, follow_redirects=True)
+
+                # Handle redirects explicitly by using the final URL
+                if response.url != url:
+                    url = str(response.url)
+                    response = await client.get(url, headers=headers)
+
+                content = b""
+                async for chunk in response.aiter_bytes():
+                    content += chunk
+                    if b"</head>" in content:
+                        break  # Stop after detecting </head>
+                return content.split(b"</head>")[0] + b"</head>"
+        except (httpx.HTTPError, gaierror):
+            return None
+
+    @staticmethod
+    async def peek_html(url, timeout=0.3):
+        head_section = await HeadPeekr.fetch_head_section(url, timeout=timeout)
+        if head_section:
+            return head_section.decode("utf-8", errors="ignore")
+        return None
+
+    @staticmethod
+    def extract_meta_tags(head_content: str):
+        meta_tags = {}
+
+        # Find all meta tags
+        meta_pattern = r'<meta[^>]+>'
+        for meta_tag in re.finditer(meta_pattern, head_content):
+            tag = meta_tag.group(0)
+
+            # Extract name/property and content
+            name_match = re.search(r'name=["\'](.*?)["\']', tag)
+            property_match = re.search(r'property=["\'](.*?)["\']', tag)
+            content_match = re.search(r'content=["\'](.*?)["\']', tag)
+
+            if content_match and (name_match or property_match):
+                key = name_match.group(1) if name_match else property_match.group(1)
+                meta_tags[key] = content_match.group(1)
+
+        return meta_tags
+
+    @staticmethod
+    def get_title(head_content: str):
+        title_match = re.search(r'<title>(.*?)</title>', head_content, re.IGNORECASE | re.DOTALL)
+        return title_match.group(1) if title_match else None
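
Usage sketch for the new HeadPeekr helper added above (the target URL is a placeholder; HeadPeekr and its imports come from this module):

    import asyncio

    async def main():
        # Grab only the <head> section of the page, then mine it for metadata
        head = await HeadPeekr.peek_html("https://example.com")
        if head:
            print(HeadPeekr.get_title(head))          # <title> text, or None
            print(HeadPeekr.extract_meta_tags(head))  # {name/property: content}

    asyncio.run(main())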