refactor(core): reorganize project structure and remove legacy code

Major reorganization of the project structure:
- Moved legacy synchronous crawler code to the legacy folder
- Removed the deprecated CLI and docs manager
- Consolidated the version manager into utils.py
- Added CrawlerHub to the __init__.py exports
- Fixed type hints in async_webcrawler.py
- Fixed minor bugs in the chunking and crawler strategies

BREAKING CHANGE: Removed the synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
2025-01-30 19:35:06 +08:00
parent 31938fb922
commit f81712eb91
23 changed files with 425 additions and 4 deletions
--- a/crawl4ai/legacy/__init__.py
+++ b/crawl4ai/legacy/__init__.py
--- a/crawl4ai/legacy/cli.py
+++ b/crawl4ai/legacy/cli.py
@@ -0,0 +1,123 @@
+import click
+import sys
+import asyncio
+from typing import List
+from .docs_manager import DocsManager
+from .async_logger import AsyncLogger
+
+# Module-level singletons shared by every CLI command defined below; both
+# are constructed as soon as this module is imported.
+logger = AsyncLogger(verbose=True)
+docs_manager = DocsManager(logger)
+
+
+def print_table(headers: List[str], rows: List[List[str]], padding: int = 2):
+    """Write an ASCII grid of ``headers`` and ``rows`` through ``click.echo``.
+
+    Each column is as wide as its longest cell (header included) and every
+    cell is left-aligned with ``padding`` spaces on both sides.
+    """
+    pad = " " * padding
+
+    # One width per column; zip() pairs each header with that column's cells.
+    widths = []
+    for column in zip(headers, *rows):
+        widths.append(max(len(str(cell)) for cell in column))
+
+    rule = "+{}+".format("+".join("-" * (width + 2 * padding) for width in widths))
+
+    def render(cells):
+        body = "|".join(
+            pad + str(cell).ljust(width) + pad for cell, width in zip(cells, widths)
+        )
+        return "|" + body + "|"
+
+    click.echo(rule)
+    click.echo(render(headers))
+    click.echo(rule)
+    for row in rows:
+        click.echo(render(row))
+    click.echo(rule)
+
+
+@click.group()
+def cli():
+    """Crawl4AI Command Line Interface"""
+    # Root command group: it has no logic of its own, sub-groups attach to it.
+
+
+@cli.group()
+def docs():
+    """Documentation operations"""
+    # Sub-group of `cli`; the documentation commands below register on it.
+
+
+@docs.command()
+@click.argument("sections", nargs=-1)
+@click.option(
+    "--mode", type=click.Choice(["extended", "condensed"]), default="extended"
+)
+def combine(sections: tuple, mode: str):
+    """Combine documentation sections"""
+    # Make sure docs are present, then print whatever the manager generates.
+    # Any failure is logged through the shared logger and exits with status 1.
+    try:
+        asyncio.run(docs_manager.ensure_docs_exist())
+        combined = docs_manager.generate(sections, mode)
+        click.echo(combined)
+    except Exception as exc:
+        logger.error(str(exc), tag="ERROR")
+        raise SystemExit(1)
+
+
+@docs.command()
+@click.argument("query")
+@click.option("--top-k", "-k", default=5)
+@click.option("--build-index", is_flag=True, help="Build index if missing")
+def search(query: str, top_k: int, build_index: bool):
+    """Search documentation"""
+    # Sentinel text the docs manager returns when no index has been built yet.
+    missing_index = "No search index available. Call build_search_index() first."
+    try:
+        result = docs_manager.search(query, top_k)
+        # Only prompt when the index is missing and --build-index was not given.
+        if result == missing_index and (
+            build_index or click.confirm("No search index found. Build it now?")
+        ):
+            asyncio.run(docs_manager.llm_text.generate_index_files())
+            result = docs_manager.search(query, top_k)
+        click.echo(result)
+    except Exception as exc:
+        click.echo(f"Error: {str(exc)}", err=True)
+        raise SystemExit(1)
+
+
+@docs.command()
+def update():
+    """Update docs from GitHub"""
+    # Run the async fetch to completion; report failures on stderr, exit 1.
+    try:
+        refresh = docs_manager.fetch_docs()
+        asyncio.run(refresh)
+        click.echo("Documentation updated successfully")
+    except Exception as exc:
+        click.echo(f"Error: {str(exc)}", err=True)
+        raise SystemExit(1)
+
+
+@docs.command()
+@click.option("--force-facts", is_flag=True, help="Force regenerate fact files")
+@click.option("--clear-cache", is_flag=True, help="Clear BM25 cache")
+def index(force_facts: bool, clear_cache: bool):
+    """Build or rebuild search indexes"""
+    try:
+        # Step 1: make sure there are docs to index.
+        asyncio.run(docs_manager.ensure_docs_exist())
+        # Step 2: (re)generate the index files with the requested flags.
+        build_options = {
+            "force_generate_facts": force_facts,
+            "clear_bm25_cache": clear_cache,
+        }
+        asyncio.run(docs_manager.llm_text.generate_index_files(**build_options))
+        click.echo("Search indexes built successfully")
+    except Exception as exc:
+        click.echo(f"Error: {str(exc)}", err=True)
+        raise SystemExit(1)
+
+
+# Add docs list command
+# NOTE(review): this function name shadows the builtin `list` for the rest
+# of the module; nothing after it in this file relies on the builtin.
+@docs.command()
+def list():
+    """List available documentation sections"""
+    try:
+        # One single-cell row per section name.
+        table_rows = []
+        for section in docs_manager.list():
+            table_rows.append([section])
+        print_table(["Sections"], table_rows)
+
+    except Exception as exc:
+        click.echo(f"Error: {str(exc)}", err=True)
+        raise SystemExit(1)
+
+
+# Invoke the root click group when this module is executed as a script.
+if __name__ == "__main__":
+    cli()
--- a/crawl4ai/legacy/crawler_strategy.py
+++ b/crawl4ai/legacy/crawler_strategy.py
@@ -0,0 +1,394 @@
+from abc import ABC, abstractmethod
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import InvalidArgumentException, WebDriverException
+# from selenium.webdriver.chrome.service import Service as ChromeService
+# from webdriver_manager.chrome import ChromeDriverManager
+# from urllib3.exceptions import MaxRetryError
+
+from .config import *
+import logging, time
+import base64
+from PIL import Image, ImageDraw, ImageFont
+from io import BytesIO
+from typing import Callable
+import requests
+import os
+from pathlib import Path
+from .utils import *
+
+# Raise the level of several library loggers to WARNING so that only
+# warnings and errors from them are emitted.
+logger = logging.getLogger("selenium.webdriver.remote.remote_connection")
+logger.setLevel(logging.WARNING)
+
+logger_driver = logging.getLogger("selenium.webdriver.common.service")
+logger_driver.setLevel(logging.WARNING)
+
+urllib3_logger = logging.getLogger("urllib3.connectionpool")
+urllib3_logger.setLevel(logging.WARNING)
+
+# Limit http.client logging to WARNING and above
+http_client_logger = logging.getLogger("http.client")
+http_client_logger.setLevel(logging.WARNING)
+
+# Limit Selenium's driver_finder logging to WARNING and above
+driver_finder_logger = logging.getLogger("selenium.webdriver.common.driver_finder")
+driver_finder_logger.setLevel(logging.WARNING)
+
+
+class CrawlerStrategy(ABC):
+    """Abstract interface that every crawler strategy has to implement."""
+
+    @abstractmethod
+    def crawl(self, url: str, **kwargs) -> str:
+        """Fetch ``url`` and return a string result for it."""
+
+    @abstractmethod
+    def take_screenshot(self, save_path: str):
+        """Capture a screenshot; ``save_path`` is the requested destination."""
+
+    @abstractmethod
+    def update_user_agent(self, user_agent: str):
+        """Switch the strategy to the given ``user_agent`` string."""
+
+    @abstractmethod
+    def set_hook(self, hook_type: str, hook: Callable):
+        """Register ``hook`` under the name ``hook_type``."""
+
+
+class CloudCrawlerStrategy(CrawlerStrategy):
+    """Crawler strategy that delegates the crawl to a remote HTTP service.
+
+    All abstract methods of ``CrawlerStrategy`` are implemented so the class
+    can actually be instantiated; operations the remote service has no
+    equivalent for raise ``NotImplementedError``.
+    """
+
+    CRAWL_ENDPOINT = "http://crawl4ai.uccode.io/crawl"
+    # Seconds to wait for the remote service before giving up; without a
+    # timeout, requests waits indefinitely on an unresponsive server.
+    REQUEST_TIMEOUT = 120
+
+    def __init__(self, use_cached_html=False):
+        super().__init__()
+        self.use_cached_html = use_cached_html
+
+    def crawl(self, url: str, **kwargs) -> str:
+        """Ask the remote service to crawl ``url`` and return its raw HTML.
+
+        Args:
+            url: Address of the page to crawl.
+            **kwargs: Accepted for compatibility with the base-class
+                signature; currently ignored.
+
+        Returns:
+            The ``html`` field of the first result, passed through
+            ``sanitize_input_encode``.
+
+        Raises:
+            requests.RequestException: On connection errors, timeouts or a
+                non-success HTTP status.
+        """
+        payload = {
+            "urls": [url],
+            "include_raw_html": True,
+            "forced": True,
+            "extract_blocks": False,
+        }
+
+        response = requests.post(
+            self.CRAWL_ENDPOINT, json=payload, timeout=self.REQUEST_TIMEOUT
+        )
+        # Surface HTTP errors directly instead of failing later while
+        # decoding or indexing an error body.
+        response.raise_for_status()
+        html = response.json()["results"][0]["html"]
+        return sanitize_input_encode(html)
+
+    def take_screenshot(self, save_path: str = None):
+        """Not supported by this strategy."""
+        raise NotImplementedError(
+            "CloudCrawlerStrategy does not support taking screenshots."
+        )
+
+    def update_user_agent(self, user_agent: str):
+        """Not supported by this strategy."""
+        raise NotImplementedError(
+            "CloudCrawlerStrategy does not support changing the user agent."
+        )
+
+    def set_hook(self, hook_type: str, hook: Callable):
+        """Not supported by this strategy."""
+        raise NotImplementedError("CloudCrawlerStrategy does not support hooks.")
+
+
+class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
+    def __init__(self, use_cached_html=False, js_code=None, **kwargs):
+        """Build the Chrome options and start the WebDriver session.
+
+        Args:
+            use_cached_html: Stored on the instance as ``use_cached_html``.
+            js_code: Stored on the instance as ``js_code``.
+            **kwargs: Optional settings read here: ``proxy``, ``user_agent``,
+                ``headless`` (default ``True``), ``verbose`` (default
+                ``False``) and ``cookies`` (an iterable of cookie dicts, each
+                passed to ``driver.add_cookie``).
+        """
+        super().__init__()
+        print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
+        self.options = Options()
+
+        proxy = kwargs.get("proxy")
+        if proxy:
+            self.options.add_argument(f"--proxy-server={proxy}")
+
+        # `or` instead of a .get() default, so that an explicit
+        # user_agent=None or "" also falls back to the default string, and
+        # exactly one user-agent argument is ever added.
+        user_agent = kwargs.get("user_agent") or (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        )
+        self.options.add_argument(f"--user-agent={user_agent}")
+
+        # The `Options.headless` attribute is deprecated in Selenium 4; pass
+        # the command-line switch explicitly instead.
+        if kwargs.get("headless", True):
+            self.options.add_argument("--headless")
+
+        for flag in (
+            "--disable-gpu",
+            "--window-size=1920,1080",
+            "--no-sandbox",
+            "--disable-dev-shm-usage",
+            "--disable-blink-features=AutomationControlled",
+            "--log-level=3",
+        ):
+            self.options.add_argument(flag)
+
+        self.use_cached_html = use_cached_html
+        self.js_code = js_code
+        self.verbose = kwargs.get("verbose", False)
+
+        # Hooks
+        self.hooks = {
+            "on_driver_created": None,
+            "on_user_agent_updated": None,
+            "before_get_url": None,
+            "after_get_url": None,
+            "before_return_html": None,
+        }
+
+        # Use selenium-manager (built into Selenium 4.10.0+). The Service
+        # object is kept for later driver restarts; this first driver is
+        # created without it, so Selenium manages its own service here.
+        self.service = Service()
+        self.driver = webdriver.Chrome(options=self.options)
+
+        # All hooks were reset to None just above, so at this point the call
+        # simply returns self.driver; it is kept for symmetry with the other
+        # driver-creation sites.
+        self.driver = self.execute_hook("on_driver_created", self.driver)
+
+        cookies = kwargs.get("cookies")
+        if cookies:
+            for cookie in cookies:
+                self.driver.add_cookie(cookie)
+
+    def set_hook(self, hook_type: str, hook: Callable):
+        """Register ``hook`` for ``hook_type``.
+
+        Raises:
+            ValueError: If ``hook_type`` is not a key of ``self.hooks``.
+        """
+        if hook_type not in self.hooks:
+            raise ValueError(f"Invalid hook type: {hook_type}")
+        self.hooks[hook_type] = hook
+
+    def execute_hook(self, hook_type: str, *args):
+        """Run the hook registered for ``hook_type`` and return a driver.
+
+        Returns ``self.driver`` when no hook is registered or the hook
+        returns ``None``; otherwise returns the hook's result.
+
+        Raises:
+            TypeError: If the hook returns something other than ``None`` or a
+                ``webdriver.Chrome`` instance.
+        """
+        callback = self.hooks.get(hook_type)
+        if not callback:
+            return self.driver
+
+        outcome = callback(*args)
+        if outcome is None:
+            return self.driver
+        if not isinstance(outcome, webdriver.Chrome):
+            raise TypeError(
+                f"Hook {hook_type} must return an instance of webdriver.Chrome or None."
+            )
+        return outcome
+
+    def update_user_agent(self, user_agent: str):
+        """Restart Chrome with an additional ``user-agent`` argument.
+
+        The current driver is quit, a new one is created from the updated
+        options, and the ``on_user_agent_updated`` hook is run on it.
+        """
+        self.options.add_argument("user-agent={}".format(user_agent))
+        self.driver.quit()
+        restarted = webdriver.Chrome(service=self.service, options=self.options)
+        # Assign before running the hook: execute_hook falls back to
+        # self.driver when the hook is absent or returns None.
+        self.driver = restarted
+        self.driver = self.execute_hook("on_user_agent_updated", self.driver)
+
+    def set_custom_headers(self, headers: dict):
+        """Send two CDP commands so that ``headers`` are set as extra HTTP headers."""
+        cdp_calls = (
+            # Enable the Network domain first ...
+            ("Network.enable", {}),
+            # ... then register the extra HTTP headers.
+            ("Network.setExtraHTTPHeaders", {"headers": headers}),
+        )
+        for command, params in cdp_calls:
+            self.driver.execute_cdp_cmd(command, params)
+
+    def _ensure_page_load(self, max_checks=6, check_interval=0.01):
+        """Poll the page source briefly and return its latest value.
+
+        The length of ``driver.page_source`` is sampled once, then re-checked
+        up to ``max_checks`` times, sleeping ``check_interval`` seconds before
+        each check. Polling stops as soon as the length differs from the
+        first sample.
+        """
+        baseline = len(self.driver.page_source)
+
+        for _ in range(max_checks):
+            time.sleep(check_interval)
+            if len(self.driver.page_source) != baseline:
+                break
+
+        return self.driver.page_source
+
+    def crawl(self, url: str, **kwargs) -> str:
+        """Load ``url`` in Chrome and return the page HTML.
+
+        Args:
+            url: Address of the page to load.
+            **kwargs: Optional settings read here:
+                ``bypass_headless`` - force the non-headless retry;
+                ``js_code`` - a string or list of strings executed after the
+                page load (replaces ``self.js_code``);
+                ``wait_for`` - a callable passed to ``WebDriverWait.until``,
+                or a CSS selector whose presence is awaited.
+
+        Returns:
+            The HTML passed through ``sanitize_input_encode``. When
+            ``use_cached_html`` is set and a cache file exists for the URL,
+            the cached content is returned without touching the browser.
+
+        Raises:
+            InvalidArgumentException, WebDriverException, Exception: The
+                original error re-wrapped with a "Failed to crawl" message
+                and chained as ``__cause__``.
+        """
+        cache_file_path = self._cache_file_path(url)
+
+        if self.use_cached_html and os.path.exists(cache_file_path):
+            # Read with the same encoding the cache is written with below.
+            with open(cache_file_path, "r", encoding="utf-8") as f:
+                return sanitize_input_encode(f.read())
+
+        try:
+            self.driver = self.execute_hook("before_get_url", self.driver)
+            if self.verbose:
+                print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
+            self.driver.get(url)
+
+            WebDriverWait(self.driver, 20).until(
+                lambda d: d.execute_script("return document.readyState") == "complete"
+            )
+            WebDriverWait(self.driver, 10).until(
+                EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
+            )
+
+            self.driver.execute_script(
+                "window.scrollTo(0, document.body.scrollHeight);"
+            )
+
+            self.driver = self.execute_hook("after_get_url", self.driver)
+            html = sanitize_input_encode(self._ensure_page_load())
+            used_headed_fallback = False
+
+            # An empty document skeleton is treated as "headless mode was
+            # blocked"; retry once in a visible browser window.
+            if (
+                kwargs.get("bypass_headless", False)
+                or html == "<html><head></head><body></body></html>"
+            ):
+                print(
+                    "[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode..."
+                )
+                used_headed_fallback = True
+                html = self._crawl_without_headless(url)
+
+            # Execute JS code if provided (a single script or a list of them).
+            self.js_code = kwargs.get("js_code", self.js_code)
+            if isinstance(self.js_code, str):
+                scripts = [self.js_code] if self.js_code else []
+            elif isinstance(self.js_code, list):
+                scripts = self.js_code
+            else:
+                scripts = []
+            for script in scripts:
+                self.driver.execute_script(script)
+                WebDriverWait(self.driver, 10).until(
+                    lambda driver: driver.execute_script("return document.readyState")
+                    == "complete"
+                )
+
+            # Optionally, wait for some condition after executing the JS code : Contributed by (https://github.com/jonymusky)
+            wait_for = kwargs.get("wait_for", False)
+            if wait_for:
+                print("[LOG] 🔄 Waiting for condition...")
+                if callable(wait_for):
+                    condition = wait_for
+                else:
+                    condition = EC.presence_of_element_located(
+                        (By.CSS_SELECTOR, wait_for)
+                    )
+                WebDriverWait(self.driver, 20).until(condition)
+
+            # The fallback's HTML came from a browser that is already closed,
+            # so only refresh it from the main driver on the normal path.
+            if not used_headed_fallback:
+                html = sanitize_input_encode(self.driver.page_source)
+            self.driver = self.execute_hook("before_return_html", self.driver, html)
+
+            # Store in cache, creating the cache directory on first use.
+            os.makedirs(os.path.dirname(cache_file_path), exist_ok=True)
+            with open(cache_file_path, "w", encoding="utf-8") as f:
+                f.write(html)
+
+            if self.verbose:
+                print(f"[LOG] ✅ Crawled {url} successfully!")
+
+            return html
+        except Exception as e:
+            # Use the exception's own `msg` when it has one, else str(e).
+            if hasattr(e, "msg"):
+                detail = e.msg
+            else:
+                detail = sanitize_input_encode(str(e))
+            # Re-raise with the same class granularity as the error caught.
+            if isinstance(e, InvalidArgumentException):
+                wrapper = InvalidArgumentException
+            elif isinstance(e, WebDriverException):
+                wrapper = WebDriverException
+            else:
+                wrapper = Exception
+            raise wrapper(f"Failed to crawl {url}: {detail}") from e
+
+    def _cache_file_path(self, url: str) -> str:
+        """Return the cache file path for ``url`` (named by the URL's MD5 hash)."""
+        import hashlib
+
+        url_hash = hashlib.md5(url.encode()).hexdigest()
+        return os.path.join(
+            os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()),
+            ".crawl4ai",
+            "cache",
+            url_hash,
+        )
+
+    def _crawl_without_headless(self, url: str) -> str:
+        """Load ``url`` in a temporary non-headless Chrome and return its HTML."""
+        options = Options()
+        # set window size very small
+        options.add_argument("--window-size=5,5")
+        # A fresh Service keeps this short-lived browser independent of the
+        # service object the main driver may be using.
+        driver = webdriver.Chrome(service=Service(), options=options)
+        try:
+            driver.get(url)
+            hooked = self.execute_hook("after_get_url", driver)
+            # Never adopt the temporary driver itself: it is quit below.
+            if hooked is not driver:
+                self.driver = hooked
+            return sanitize_input_encode(driver.page_source)
+        finally:
+            driver.quit()
+
+    def take_screenshot(self) -> str:
+        """Return a base64-encoded JPEG screenshot of the current page.
+
+        The window is resized to the document's scroll width and height
+        before capturing. If anything fails, the error is printed and a
+        black 800x600 JPEG with the error text drawn on it is returned
+        instead (also base64-encoded).
+        """
+        try:
+            # Resize the window to the page dimensions.
+            page_width = self.driver.execute_script("return document.body.scrollWidth")
+            page_height = self.driver.execute_script(
+                "return document.body.scrollHeight"
+            )
+            self.driver.set_window_size(page_width, page_height)
+
+            # PNG from the driver -> RGB image -> compressed JPEG in memory.
+            png_bytes = self.driver.get_screenshot_as_png()
+            jpeg_buffer = BytesIO()
+            Image.open(BytesIO(png_bytes)).convert("RGB").save(
+                jpeg_buffer, format="JPEG", quality=85
+            )
+            encoded = base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")
+
+            if self.verbose:
+                print("[LOG] 📸 Screenshot taken and converted to base64")
+
+            return encoded
+        except Exception as exc:
+            error_message = sanitize_input_encode(
+                f"Failed to take screenshot: {str(exc)}"
+            )
+            print(error_message)
+
+            # Placeholder image: white error text on a black background.
+            placeholder = Image.new("RGB", (800, 600), color="black")
+            canvas = ImageDraw.Draw(placeholder)
+
+            try:
+                font = ImageFont.truetype("arial.ttf", 40)
+            except IOError:
+                font = ImageFont.load_default()
+
+            canvas.text(
+                (10, 10),
+                wrap_text(canvas, error_message, font, 780),
+                fill=(255, 255, 255),
+                font=font,
+            )
+
+            fallback_buffer = BytesIO()
+            placeholder.save(fallback_buffer, format="JPEG")
+            return base64.b64encode(fallback_buffer.getvalue()).decode("utf-8")
+
+    def quit(self):
+        """Quit the current WebDriver session."""
+        driver = self.driver
+        driver.quit()
--- a/crawl4ai/legacy/database.py
+++ b/crawl4ai/legacy/database.py
@@ -0,0 +1,180 @@
+import os
+from pathlib import Path
+import sqlite3
+from typing import Optional, Tuple
+
+# Resolve the data directory (<base>/.crawl4ai, where <base> is
+# $CRAWL4_AI_BASE_DIRECTORY or the user's home directory), create it at
+# import time, then repoint DB_PATH at the SQLite file inside it.
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
+os.makedirs(DB_PATH, exist_ok=True)
+DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
+
+
+def init_db():
+    """Create the ``crawled_data`` table in the database if it is missing.
+
+    Raises:
+        sqlite3.Error: If the database cannot be opened or the statement
+            fails; the connection is closed in either case.
+    """
+    conn = sqlite3.connect(DB_PATH)
+    try:
+        # String defaults use single quotes: double-quoted string literals
+        # are a legacy SQLite quirk that some builds disable.
+        conn.execute(
+            """
+            CREATE TABLE IF NOT EXISTS crawled_data (
+                url TEXT PRIMARY KEY,
+                html TEXT,
+                cleaned_html TEXT,
+                markdown TEXT,
+                extracted_content TEXT,
+                success BOOLEAN,
+                media TEXT DEFAULT '{}',
+                links TEXT DEFAULT '{}',
+                metadata TEXT DEFAULT '{}',
+                screenshot TEXT DEFAULT ''
+            )
+            """
+        )
+        conn.commit()
+    finally:
+        conn.close()
+
+
+def alter_db_add_screenshot(new_column: str = "media"):
+    """Add a TEXT column named ``new_column`` (default ``''``) to ``crawled_data``.
+
+    Errors, including an invalid column name, are printed rather than
+    raised.
+    """
+    check_db_path()
+    try:
+        # Column names cannot be bound as SQL parameters, so restrict them to
+        # plain identifiers before interpolating into the statement.
+        if not new_column.isidentifier():
+            raise ValueError(f"Invalid column name: {new_column!r}")
+        conn = sqlite3.connect(DB_PATH)
+        try:
+            conn.execute(
+                f"ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ''"
+            )
+            conn.commit()
+        finally:
+            # Close even when the statement fails.
+            conn.close()
+    except Exception as e:
+        print(f"Error altering database to add screenshot column: {e}")
+
+
+def check_db_path():
+    """Raise ``ValueError`` when the module-level ``DB_PATH`` is empty or unset."""
+    if DB_PATH:
+        return
+    raise ValueError("Database path is not set or is empty.")
+
+
+def get_cached_url(
+    url: str,
+) -> Optional[Tuple[str, str, str, str, str, bool, str, str, str, str]]:
+    """Return the cached row for ``url``, or ``None``.
+
+    The row is ``(url, html, cleaned_html, markdown, extracted_content,
+    success, media, links, metadata, screenshot)``. ``None`` is returned both
+    when the URL is not cached and when the lookup fails (the error is
+    printed).
+    """
+    check_db_path()
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        try:
+            cursor = conn.execute(
+                "SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?",
+                (url,),
+            )
+            return cursor.fetchone()
+        finally:
+            # Close even when the query fails.
+            conn.close()
+    except Exception as e:
+        print(f"Error retrieving cached URL: {e}")
+        return None
+
+
+def cache_url(
+    url: str,
+    html: str,
+    cleaned_html: str,
+    markdown: str,
+    extracted_content: str,
+    success: bool,
+    media: str = "{}",
+    links: str = "{}",
+    metadata: str = "{}",
+    screenshot: str = "",
+):
+    """Insert the crawl result for ``url``, or overwrite the existing row.
+
+    Errors are printed rather than raised.
+    """
+    check_db_path()
+    row = (
+        url,
+        html,
+        cleaned_html,
+        markdown,
+        extracted_content,
+        success,
+        media,
+        links,
+        metadata,
+        screenshot,
+    )
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        try:
+            # Upsert keyed on the primary key `url`.
+            conn.execute(
+                """
+                INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                ON CONFLICT(url) DO UPDATE SET
+                    html = excluded.html,
+                    cleaned_html = excluded.cleaned_html,
+                    markdown = excluded.markdown,
+                    extracted_content = excluded.extracted_content,
+                    success = excluded.success,
+                    media = excluded.media,
+                    links = excluded.links,
+                    metadata = excluded.metadata,
+                    screenshot = excluded.screenshot
+                """,
+                row,
+            )
+            conn.commit()
+        finally:
+            # Close even when the statement fails.
+            conn.close()
+    except Exception as e:
+        print(f"Error caching URL: {e}")
+
+
+def get_total_count() -> int:
+    """Return the number of rows in ``crawled_data``; 0 if the query fails.
+
+    Errors are printed rather than raised.
+    """
+    check_db_path()
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        try:
+            (count,) = conn.execute("SELECT COUNT(*) FROM crawled_data").fetchone()
+        finally:
+            # Close even when the query fails.
+            conn.close()
+        return count
+    except Exception as e:
+        print(f"Error getting total count: {e}")
+        return 0
+
+
+def clear_db():
+    """Delete every row from ``crawled_data`` (the table itself is kept).
+
+    Errors are printed rather than raised.
+    """
+    check_db_path()
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        try:
+            conn.execute("DELETE FROM crawled_data")
+            conn.commit()
+        finally:
+            # Close even when the statement fails.
+            conn.close()
+    except Exception as e:
+        print(f"Error clearing database: {e}")
+
+
+def flush_db():
+    """Drop the ``crawled_data`` table.
+
+    Errors (for example, the table not existing) are printed rather than
+    raised.
+    """
+    check_db_path()
+    try:
+        conn = sqlite3.connect(DB_PATH)
+        try:
+            conn.execute("DROP TABLE crawled_data")
+            conn.commit()
+        finally:
+            # Close even when the statement fails.
+            conn.close()
+    except Exception as e:
+        print(f"Error flushing database: {e}")
+
+
+def update_existing_records(new_column: str = "media", default_value: str = "{}"):
+    """Set ``new_column`` to ``default_value`` on rows whose screenshot is NULL.
+
+    Errors, including an invalid column name, are printed rather than
+    raised.
+    """
+    check_db_path()
+    try:
+        # Column names cannot be bound as SQL parameters, so restrict them to
+        # plain identifiers before interpolating into the statement.
+        if not new_column.isidentifier():
+            raise ValueError(f"Invalid column name: {new_column!r}")
+        conn = sqlite3.connect(DB_PATH)
+        try:
+            # The value is bound as a parameter so it is always stored as a
+            # string and never parsed as SQL or as a column reference.
+            # NOTE(review): the filter tests `screenshot`, not `new_column` --
+            # confirm this is intended.
+            conn.execute(
+                f"UPDATE crawled_data SET {new_column} = ? WHERE screenshot IS NULL",
+                (default_value,),
+            )
+            conn.commit()
+        finally:
+            # Close even when the statement fails.
+            conn.close()
+    except Exception as e:
+        print(f"Error updating existing records: {e}")
+
+
+# Running this module as a script resets the cache database: the existing
+# file is removed and an empty schema is created.
+if __name__ == "__main__":
+    # Delete the existing database file
+    if os.path.exists(DB_PATH):
+        os.remove(DB_PATH)
+    init_db()
+    # alter_db_add_screenshot("COL_NAME")
--- a/crawl4ai/legacy/docs_manager.py
+++ b/crawl4ai/legacy/docs_manager.py
@@ -0,0 +1,75 @@
+import requests
+import shutil
+from pathlib import Path
+from crawl4ai.async_logger import AsyncLogger
+from crawl4ai.llmtxt import AsyncLLMTextManager
+
+
+class DocsManager:
+    """Keeps a per-user copy of the documentation files and exposes
+    generation and search through an ``AsyncLLMTextManager``."""
+
+    # Seconds to wait on each HTTP request; without a timeout, requests
+    # waits indefinitely on an unresponsive server.
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self, logger=None):
+        """Create ``~/.crawl4ai/docs`` if needed and set up the text manager.
+
+        Args:
+            logger: Logger to use; a verbose ``AsyncLogger`` is created when
+                omitted.
+        """
+        self.docs_dir = Path.home() / ".crawl4ai" / "docs"
+        self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt"
+        self.docs_dir.mkdir(parents=True, exist_ok=True)
+        self.logger = logger or AsyncLogger(verbose=True)
+        self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger)
+
+    async def ensure_docs_exist(self):
+        """Fetch docs if not present"""
+        if not any(self.docs_dir.iterdir()):
+            await self.fetch_docs()
+
+    async def fetch_docs(self) -> bool:
+        """Copy from local docs or download from GitHub
+
+        Returns:
+            ``True`` on success.
+
+        Raises:
+            Exception: Any failure is logged and re-raised unchanged.
+        """
+        try:
+            # Try local first
+            if self.local_docs.exists() and (
+                any(self.local_docs.glob("*.md"))
+                or any(self.local_docs.glob("*.tokens"))
+            ):
+                # Replace the Markdown files in the user's docs directory
+                for stale in self.docs_dir.glob("*.md"):
+                    stale.unlink()
+                for source in self.local_docs.glob("*.md"):
+                    shutil.copy2(source, self.docs_dir / source.name)
+                return True
+
+            # Fallback to GitHub
+            response = requests.get(
+                "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt",
+                headers={"Accept": "application/vnd.github.v3+json"},
+                timeout=self.REQUEST_TIMEOUT,
+            )
+            response.raise_for_status()
+
+            for item in response.json():
+                if item["type"] == "file" and item["name"].endswith(".md"):
+                    download = requests.get(
+                        item["download_url"], timeout=self.REQUEST_TIMEOUT
+                    )
+                    # Do not save an HTTP error page as documentation.
+                    download.raise_for_status()
+                    with open(self.docs_dir / item["name"], "w", encoding="utf-8") as f:
+                        f.write(download.text)
+            return True
+
+        except Exception as e:
+            self.logger.error(f"Failed to fetch docs: {str(e)}")
+            raise
+
+    def list(self) -> list[str]:
+        """List available topics
+
+        Topic names are the stems of the ``*.md`` files in ``docs_dir``. For
+        a stem that starts with a digit and contains an underscore, the part
+        up to the first underscore is dropped. Stems ending in ``.xs`` or
+        ``.q`` are excluded.
+        """
+        names = []
+        for file_path in self.docs_dir.glob("*.md"):
+            stem = file_path.stem
+            # Strip the numeric prefix only when there is an underscore to
+            # split on; "2024notes" is kept whole instead of raising.
+            if stem[:1].isdigit() and "_" in stem:
+                stem = stem.split("_", 1)[1]
+            # Exclude those that end with .xs.md and .q.md
+            if stem.endswith((".xs", ".q")):
+                continue
+            names.append(stem)
+        return names
+
+    def generate(self, sections, mode="extended"):
+        """Delegate to ``AsyncLLMTextManager.generate``."""
+        return self.llm_text.generate(sections, mode)
+
+    def search(self, query: str, top_k: int = 5):
+        """Delegate to ``AsyncLLMTextManager.search``."""
+        return self.llm_text.search(query, top_k)
--- a/crawl4ai/legacy/llmtxt.py
+++ b/crawl4ai/legacy/llmtxt.py
@@ -0,0 +1,546 @@
+import os
+from pathlib import Path
+import re
+from typing import Dict, List, Tuple, Optional, Any
+import json
+from tqdm import tqdm
+import time
+import psutil
+import numpy as np
+from rank_bm25 import BM25Okapi
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from litellm import batch_completion
+from .async_logger import AsyncLogger
+import litellm
+import pickle
+import hashlib  # <--- ADDED for file-hash
+import glob
+
+# Turn off litellm's verbose output (module-wide setting on the litellm package).
+litellm.set_verbose = False
+
+
+def _compute_file_hash(file_path: Path) -> str:
+    """Return the hexadecimal MD5 digest of the file's full content.
+
+    The file is read in 4096-byte chunks rather than loaded in one piece.
+    """
+    digest = hashlib.md5()
+    with file_path.open("rb") as stream:
+        while True:
+            block = stream.read(4096)
+            if not block:
+                break
+            digest.update(block)
+    return digest.hexdigest()
+
+
+class AsyncLLMTextManager:
+    def __init__(
+        self,
+        docs_dir: Path,
+        logger: Optional[AsyncLogger] = None,
+        max_concurrent_calls: int = 5,
+        batch_size: int = 3,
+    ) -> None:
+        """Set up an empty manager rooted at ``docs_dir``.
+
+        Args:
+            docs_dir: Directory holding the documentation files. A plain
+                string is accepted and converted to ``Path``.
+            logger: Logger used by every method. The default is ``None``, but
+                the methods call it unconditionally, so pass a real logger
+                before using them.
+            max_concurrent_calls: Stored on the instance; not read elsewhere
+                in this class.
+            batch_size: Number of documents handed to each
+                ``_process_document_batch`` call.
+        """
+        # Coerce so the "/" joins below (and in other methods) also work when
+        # the caller passes a str path.
+        self.docs_dir = Path(docs_dir)
+        self.logger = logger
+        self.max_concurrent_calls = max_concurrent_calls
+        self.batch_size = batch_size
+        self.bm25_index = None
+        # Maps each fact line to the .q.md file it was read from.
+        self.document_map: Dict[str, Any] = {}
+        # Fact lines, positionally aligned with the documents in bm25_index.
+        self.tokenized_facts: List[str] = []
+        self.bm25_index_file = self.docs_dir / "bm25_index.pkl"
+
+    async def _process_document_batch(self, doc_batch: List[Path]) -> None:
+        """Generate a ``.q.md`` fact index for each document in a batch.
+
+        Each readable, non-empty document is sent to the model in a single
+        ``batch_completion`` call. The text inside ``<index>...</index>`` in
+        each response is written next to the source document with the
+        ``.q.md`` suffix. Failures are logged and never raised.
+
+        Args:
+            doc_batch: Paths of the documentation files to index.
+        """
+        # Keep every usable document paired with its own path. Responses are
+        # matched back by position, so documents that are skipped (unreadable
+        # or empty) must be left out of BOTH lists; previously they were
+        # dropped from the request only, which shifted every later response
+        # onto the wrong file.
+        readable: List[Tuple[Path, str]] = []
+        for file_path in doc_batch:
+            try:
+                with open(file_path, "r", encoding="utf-8") as f:
+                    content = f.read()
+            except Exception as e:
+                self.logger.error(f"Error reading {file_path}: {str(e)}")
+                continue
+            if content:
+                readable.append((file_path, content))
+
+        if not readable:
+            # Nothing to send; avoid an empty completion request.
+            return
+
+        prompt = """Given a documentation file, generate a list of atomic facts where each fact:
+1. Represents a single piece of knowledge
+2. Contains variations in terminology for the same concept
+3. References relevant code patterns if they exist
+4. Is written in a way that would match natural language queries
+
+Each fact should follow this format:
+<main_concept>: <fact_statement> | <related_terms> | <code_reference>
+
+Example Facts:
+browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True)
+redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0)
+pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5]
+
+Wrap your response in <index>...</index> tags.
+"""
+
+        # Prepare messages for batch processing
+        messages_list = [
+            [
+                {
+                    "role": "user",
+                    "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}",
+                }
+            ]
+            for _, content in readable
+        ]
+
+        try:
+            responses = batch_completion(
+                model="anthropic/claude-3-5-sonnet-latest",
+                messages=messages_list,
+                logger_fn=None,
+            )
+
+            # Process responses and save index files
+            for response, (file_path, _) in zip(responses, readable):
+                try:
+                    index_content_match = re.search(
+                        r"<index>(.*?)</index>",
+                        response.choices[0].message.content,
+                        re.DOTALL,
+                    )
+                    if not index_content_match:
+                        self.logger.warning(
+                            f"No <index>...</index> content found for {file_path}"
+                        )
+                        continue
+
+                    # Collapse blank lines so the file holds one fact per line.
+                    index_content = re.sub(
+                        r"\n\s*\n", "\n", index_content_match.group(1)
+                    ).strip()
+                    if index_content:
+                        index_file = file_path.with_suffix(".q.md")
+                        with open(index_file, "w", encoding="utf-8") as f:
+                            f.write(index_content)
+                        self.logger.info(f"Created index file: {index_file}")
+                    else:
+                        self.logger.warning(
+                            f"No index content found in response for {file_path}"
+                        )
+
+                except Exception as e:
+                    self.logger.error(
+                        f"Error processing response for {file_path}: {str(e)}"
+                    )
+
+        except Exception as e:
+            self.logger.error(f"Error in batch completion: {str(e)}")
+
+    def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]:
+        """Check that a line has the ``concept: fact | terms | code`` shape.
+
+        Returns:
+            ``(True, None)`` when the line has exactly three ``|``-separated
+            parts and the first part contains a ``:``; otherwise
+            ``(False, reason)`` with a short description of the problem.
+        """
+        fields = [field.strip() for field in line.split("|")]
+        field_count = len(fields)
+
+        if field_count == 1:
+            # split() yields a single piece exactly when no '|' is present.
+            return False, "Missing separator '|'"
+        if field_count != 3:
+            return False, f"Expected 3 parts, got {field_count}"
+        if ":" not in fields[0]:
+            return False, "Missing ':' in concept definition"
+        return True, None
+
+    def _load_or_create_token_cache(self, fact_file: Path) -> Dict:
+        """
+        Load the token cache for ``fact_file`` if it is present, well formed
+        and recorded against the file's current content hash. Otherwise return
+        a new, empty structure carrying the current hash.
+
+        The cache path is ``fact_file.with_suffix(".q.tokens")``; for a file
+        named ``name.q.md`` that is ``name.q.q.tokens``, because only the
+        final ``.md`` is replaced.
+
+        Returns:
+            A dict with a ``"facts"`` dict and a ``"content_hash"`` string.
+        """
+        cache_file = fact_file.with_suffix(".q.tokens")
+        current_hash = _compute_file_hash(fact_file)
+
+        if cache_file.exists():
+            try:
+                with open(cache_file, "r", encoding="utf-8") as f:
+                    cache = json.load(f)
+            except json.JSONDecodeError:
+                self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.")
+            except Exception as e:
+                self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}")
+            else:
+                # Valid JSON is not enough: callers index cache["facts"], so a
+                # payload without a "facts" dict is treated as corrupt rather
+                # than returned.
+                if not isinstance(cache, dict) or not isinstance(
+                    cache.get("facts"), dict
+                ):
+                    self.logger.warning(
+                        f"Corrupt token cache for {fact_file}, rebuilding."
+                    )
+                elif cache.get("content_hash") == current_hash:
+                    # If the hash matches, return it directly
+                    return cache
+                else:
+                    # Otherwise, we signal that it's changed
+                    self.logger.info(f"Hash changed for {fact_file}, reindex needed.")
+
+        # Return a fresh cache
+        return {"facts": {}, "content_hash": current_hash}
+
+    def _save_token_cache(self, fact_file: Path, cache: Dict) -> None:
+        """Write ``cache`` as JSON to ``fact_file.with_suffix(".q.tokens")``.
+
+        The ``"content_hash"`` entry of the passed-in dict is overwritten with
+        the current hash of ``fact_file`` before saving.
+        """
+        # Always store the hash of the file as it is right now.
+        cache.update(content_hash=_compute_file_hash(fact_file))
+        target = fact_file.with_suffix(".q.tokens")
+        with open(target, "w") as out:
+            json.dump(cache, out)
+
+    def preprocess_text(self, text: str) -> List[str]:
+        """Turn a fact line or a query into a list of search tokens.
+
+        The text is split on ``|`` (when present) and the first ``:`` of the
+        first segment is removed. For each segment that contains both ``(`` and
+        ``)``, identifiers directly before ``(`` or before a ``=`` followed by
+        a quoted word are added as-is. Every segment is then lower-cased,
+        word-tokenized, stripped of English stop words (question words are
+        kept) and lemmatized.
+        """
+        if "|" in text:
+            segments = [piece.strip() for piece in text.split("|")]
+        else:
+            segments = [text]
+        # Remove : after the first word of the leading segment
+        segments[0] = re.sub(r"^(.*?):", r"\1", segments[0])
+
+        # Question words are removed from the stop list so they stay searchable.
+        question_words = {"how", "what", "when", "where", "why", "which"}
+        lemmatizer = WordNetLemmatizer()
+        ignored = set(stopwords.words("english")) - question_words
+
+        collected = []
+        for segment in segments:
+            if "(" in segment and ")" in segment:
+                collected.extend(
+                    re.findall(
+                        r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', segment
+                    )
+                )
+
+            for word in word_tokenize(segment.lower()):
+                if word in ignored:
+                    continue
+                collected.append(lemmatizer.lemmatize(word))
+
+        return collected
+
+    def maybe_load_bm25_index(self, clear_cache=False) -> bool:
+        """
+        Load existing BM25 index from disk, if present and clear_cache=False.
+
+        Returns:
+            True when ``self.bm25_index`` and ``self.tokenized_facts`` were
+            replaced from the pickle file. False when ``clear_cache`` is set,
+            the file is missing, or it cannot be read; in that case neither
+            attribute is modified.
+
+        Note:
+            ``self.document_map`` is not stored in the pickle and is not
+            restored here.
+        """
+        if clear_cache or not os.path.exists(self.bm25_index_file):
+            return False
+
+        self.logger.info("Loading existing BM25 index from disk.")
+        try:
+            # NOTE(security): pickle can execute arbitrary code while loading.
+            # Only load index files this process (or a trusted one) wrote.
+            with open(self.bm25_index_file, "rb") as f:
+                data = pickle.load(f)
+            # Read both entries before assigning so a partial payload cannot
+            # leave the instance half-updated.
+            tokenized_facts = data["tokenized_facts"]
+            bm25_index = data["bm25_index"]
+        except Exception as e:
+            # A truncated or incompatible file should trigger a rebuild by the
+            # caller rather than abort indexing.
+            self.logger.warning(
+                f"Could not load BM25 index from {self.bm25_index_file}: {str(e)}"
+            )
+            return False
+
+        self.tokenized_facts = tokenized_facts
+        self.bm25_index = bm25_index
+        return True
+
+    def build_search_index(self, clear_cache=False) -> None:
+        """
+        Build or refresh the BM25 search index over the facts in the .q.md files.
+
+        Checks for new or modified .q.md files by comparing file-hash.
+        If none need reindexing and clear_cache is False, loads existing index if available.
+        Otherwise, reindexes only changed/new files and merges or creates a new index.
+
+        Args:
+            clear_cache: If True, delete the pickled index file and re-tokenize
+                every .q.md file regardless of its token cache.
+
+        Side effects:
+            Updates self.bm25_index, self.tokenized_facts and self.document_map.
+            Each reindexed file that parses without error gets its token cache
+            rewritten, and whenever an index is built here it is pickled to
+            self.bm25_index_file.
+        """
+        # If clear_cache is True, we skip partial logic: rebuild everything from scratch
+        if clear_cache:
+            self.logger.info("Clearing cache and rebuilding full search index.")
+            if self.bm25_index_file.exists():
+                self.bm25_index_file.unlink()
+
+        # Used only to report this process's memory usage in the log.
+        process = psutil.Process()
+        self.logger.info("Checking which .q.md files need (re)indexing...")
+
+        # Gather all .q.md files
+        q_files = [
+            self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")
+        ]
+
+        # We'll store known (unchanged) facts in these lists
+        existing_facts: List[str] = []
+        existing_tokens: List[List[str]] = []
+
+        # Keep track of invalid lines for logging
+        invalid_lines = []
+        needSet = []  # files that must be (re)indexed
+
+        for qf in q_files:
+            # with_suffix swaps only the final ".md", so "name.q.md" maps to
+            # "name.q.q.tokens"; the load/save helpers derive the same path.
+            token_cache_file = qf.with_suffix(".q.tokens")
+
+            # If no .q.tokens or clear_cache is True → definitely reindex
+            if clear_cache or not token_cache_file.exists():
+                needSet.append(qf)
+                continue
+
+            # Otherwise, load the existing cache and compare hash
+            cache = self._load_or_create_token_cache(qf)
+            # If the .q.tokens was out of date (i.e. changed hash), we reindex.
+            # An empty "facts" dict also forces a reindex.
+            if len(cache["facts"]) == 0 or cache.get(
+                "content_hash"
+            ) != _compute_file_hash(qf):
+                needSet.append(qf)
+            else:
+                # File is unchanged → retrieve cached token data
+                for line, cache_data in cache["facts"].items():
+                    existing_facts.append(line)
+                    existing_tokens.append(cache_data["tokens"])
+                    self.document_map[line] = qf  # track the doc for that fact
+
+        if not needSet and not clear_cache:
+            # If no file needs reindexing, try loading existing index
+            if self.maybe_load_bm25_index(clear_cache=False):
+                self.logger.info(
+                    "No new/changed .q.md files found. Using existing BM25 index."
+                )
+                return
+            else:
+                # If there's no existing index, we must build a fresh index from the old caches
+                self.logger.info(
+                    "No existing BM25 index found. Building from cached facts."
+                )
+                if existing_facts:
+                    self.logger.info(
+                        f"Building BM25 index with {len(existing_facts)} cached facts."
+                    )
+                    self.bm25_index = BM25Okapi(existing_tokens)
+                    self.tokenized_facts = existing_facts
+                    with open(self.bm25_index_file, "wb") as f:
+                        pickle.dump(
+                            {
+                                "bm25_index": self.bm25_index,
+                                "tokenized_facts": self.tokenized_facts,
+                            },
+                            f,
+                        )
+                else:
+                    self.logger.warning("No facts found at all. Index remains empty.")
+                return
+
+        # -----------------------------------------------------
+        # If we reach here, we have new or changed .q.md files
+        # We'll parse them, reindex them, and then combine with existing_facts
+        # -----------------------------------------------------
+
+        self.logger.info(f"{len(needSet)} file(s) need reindexing. Parsing now...")
+
+        # 1) Parse the new or changed .q.md files
+        new_facts = []
+        new_tokens = []
+        with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar:
+            for file in needSet:
+                # We'll build up a fresh cache
+                fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)}
+                try:
+                    with open(file, "r", encoding="utf-8") as f_obj:
+                        content = f_obj.read().strip()
+                        lines = [l.strip() for l in content.split("\n") if l.strip()]
+
+                    for line in lines:
+                        is_valid, error = self._validate_fact_line(line)
+                        if not is_valid:
+                            # Invalid lines are skipped here and reported after the loop.
+                            invalid_lines.append((file, line, error))
+                            continue
+
+                        tokens = self.preprocess_text(line)
+                        fresh_cache["facts"][line] = {
+                            "tokens": tokens,
+                            "added": time.time(),
+                        }
+                        new_facts.append(line)
+                        new_tokens.append(tokens)
+                        self.document_map[line] = file
+
+                    # Save the new .q.tokens with updated hash
+                    self._save_token_cache(file, fresh_cache)
+
+                    # Resident set size of this process, converted from bytes to MB.
+                    mem_usage = process.memory_info().rss / 1024 / 1024
+                    self.logger.debug(
+                        f"Memory usage after {file.name}: {mem_usage:.2f}MB"
+                    )
+
+                except Exception as e:
+                    # A failing file is logged and skipped; facts collected from
+                    # it before the error stay in new_facts/new_tokens.
+                    self.logger.error(f"Error processing {file}: {str(e)}")
+
+                file_pbar.update(1)
+
+        if invalid_lines:
+            self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:")
+            for file, line, error in invalid_lines:
+                self.logger.warning(f"{file}: {error} in line: {line[:50]}...")
+
+        # 2) Merge newly tokenized facts with the existing ones
+        all_facts = existing_facts + new_facts
+        all_tokens = existing_tokens + new_tokens
+
+        # 3) Build BM25 index from combined facts
+        self.logger.info(
+            f"Building BM25 index with {len(all_facts)} total facts (old + new)."
+        )
+        self.bm25_index = BM25Okapi(all_tokens)
+        self.tokenized_facts = all_facts
+
+        # 4) Save the updated BM25 index to disk
+        with open(self.bm25_index_file, "wb") as f:
+            pickle.dump(
+                {
+                    "bm25_index": self.bm25_index,
+                    "tokenized_facts": self.tokenized_facts,
+                },
+                f,
+            )
+
+        final_mem = process.memory_info().rss / 1024 / 1024
+        self.logger.info(f"Search index updated. Final memory usage: {final_mem:.2f}MB")
+
+    async def generate_index_files(
+        self, force_generate_facts: bool = False, clear_bm25_cache: bool = False
+    ) -> None:
+        """
+        Generate index files for all documents in batches, then build the
+        search index.
+
+        Args:
+            force_generate_facts (bool): If True, regenerate indexes even if they exist
+            clear_bm25_cache (bool): If True, clear existing BM25 index cache
+        """
+        self.logger.info("Starting index generation for documentation files.")
+
+        # Source documents only: skip the derived .q.md and .xs.md files.
+        md_files = [
+            self.docs_dir / f
+            for f in os.listdir(self.docs_dir)
+            if f.endswith(".md") and not f.endswith((".q.md", ".xs.md"))
+        ]
+
+        # Filter out files that already have .q files unless force=True.
+        # with_suffix matches how _process_document_batch names the file it
+        # writes; str.replace would rewrite every ".md" inside the name.
+        if not force_generate_facts:
+            md_files = [f for f in md_files if not f.with_suffix(".q.md").exists()]
+
+        if not md_files:
+            self.logger.info("All index files exist. Use force=True to regenerate.")
+        else:
+            # Ceiling division: the previous "len // size + 1" reported one
+            # batch too many whenever the count was an exact multiple.
+            total_batches = -(-len(md_files) // self.batch_size)
+            batch_starts = range(0, len(md_files), self.batch_size)
+            # Process documents in batches
+            for batch_number, start in enumerate(batch_starts, start=1):
+                batch = md_files[start : start + self.batch_size]
+                self.logger.info(f"Processing batch {batch_number}/{total_batches}")
+                await self._process_document_batch(batch)
+
+        self.logger.info("Index generation complete, building/updating search index.")
+        self.build_search_index(clear_cache=clear_bm25_cache)
+
+    def generate(self, sections: List[str], mode: str = "extended") -> str:
+        """Concatenate the numbered documentation files into one string.
+
+        Args:
+            sections: Case-insensitive substrings; when non-empty, only
+                documents whose base name contains one of them are included.
+            mode: ``"condensed"`` uses a document's ``.xs.md`` file when it
+                exists and its ``.md`` file otherwise; any other value always
+                uses the ``.md`` file.
+
+        Returns:
+            The selected files ordered by numeric prefix, each under a banner
+            with its file name, joined by ``---`` separators. Empty string when
+            nothing could be read.
+        """
+        # Collect every numbered markdown file (the two patterns overlap).
+        candidates = []
+        for pattern in ("[0-9]*.md", "[0-9]*.xs.md"):
+            candidates.extend(glob.glob(str(self.docs_dir / pattern)))
+
+        # Base name = file name up to the first dot; fact indexes are ignored.
+        base_docs = set()
+        for candidate in candidates:
+            file_name = Path(candidate).name
+            if file_name.endswith(".q.md"):
+                continue
+            base_docs.add(file_name.split(".")[0])
+
+        # Narrow down to the requested sections, if any were given.
+        if sections:
+
+            def is_requested(doc_name):
+                lowered = doc_name.lower()
+                return any(section.lower() in lowered for section in sections)
+
+            base_docs = {doc for doc in base_docs if is_requested(doc)}
+
+        def numeric_prefix(doc_name):
+            # Names without a purely numeric prefix sort last.
+            prefix = doc_name.split("_")[0]
+            return int(prefix) if prefix.isdigit() else 999999
+
+        # Pick the file to read for each document according to the mode.
+        condensed = mode == "condensed"
+        files = []
+        for doc in sorted(base_docs, key=numeric_prefix):
+            chosen = self.docs_dir / f"{doc}.md"
+            if condensed:
+                xs_file = self.docs_dir / f"{doc}.xs.md"
+                if xs_file.exists():
+                    chosen = xs_file
+            files.append(str(chosen))
+
+        # Read each file and put a banner with its name in front of it.
+        banner = "#" * 20
+        content = []
+        for file in files:
+            try:
+                with open(file, "r", encoding="utf-8") as f:
+                    fname = Path(file).name
+                    content.append(f"{banner}\n# {fname}\n{banner}\n\n{f.read()}")
+            except Exception as e:
+                self.logger.error(f"Error reading {file}: {str(e)}")
+
+        return "\n\n---\n\n".join(content)
+
+    def search(self, query: str, top_k: int = 5) -> str:
+        """Return the documents that best match ``query``.
+
+        Facts are scored with the BM25 index; only facts scoring above
+        ``mean + 0.25 * std`` of all scores are considered. Matching facts are
+        grouped by source file and the files are ranked by
+        ``2 * code_match_score + 1.5 * match_count + total_score``.
+
+        Args:
+            query: Free-text search query.
+            top_k: Maximum number of documents to return.
+
+        Returns:
+            The content of the top-ranked ``.md`` documents, each under a
+            banner with its file name, joined by ``---`` separators; or a
+            notice string when no index has been built or loaded.
+        """
+        if not self.bm25_index:
+            return "No search index available. Call build_search_index() first."
+
+        query_tokens = self.preprocess_text(query)
+        doc_scores = self.bm25_index.get_scores(query_tokens)
+
+        mean_score = np.mean(doc_scores)
+        std_score = np.std(doc_scores)
+        score_threshold = mean_score + (0.25 * std_score)
+
+        file_data = self._aggregate_search_scores(
+            doc_scores=doc_scores,
+            score_threshold=score_threshold,
+            query_tokens=query_tokens,
+        )
+
+        ranked_files = sorted(
+            file_data.items(),
+            key=lambda x: (
+                x[1]["code_match_score"] * 2.0
+                + x[1]["match_count"] * 1.5
+                + x[1]["total_score"]
+            ),
+            reverse=True,
+        )[:top_k]
+
+        index_suffix = ".q.md"
+        results = []
+        for file, _ in ranked_files:
+            # Resolve the main document by file NAME inside docs_dir. The
+            # mapped path already includes docs_dir, so joining the full path
+            # again doubled the directory for a relative docs_dir and the
+            # document was never found.
+            index_name = Path(file).name
+            if index_name.endswith(index_suffix):
+                index_name = index_name[: -len(index_suffix)] + ".md"
+            main_doc = self.docs_dir / index_name
+            if main_doc.exists():
+                with open(main_doc, "r", encoding="utf-8") as f:
+                    content = ["#" * 20, f"# {main_doc.name}", "#" * 20, "", f.read()]
+                results.append("\n".join(content))
+
+        return "\n\n---\n\n".join(results)
+
+    def _aggregate_search_scores(
+        self, doc_scores: List[float], score_threshold: float, query_tokens: List[str]
+    ) -> Dict:
+        """Group the facts scoring above the threshold by their source file.
+
+        Args:
+            doc_scores: One score per entry of ``self.tokenized_facts``.
+            score_threshold: Facts scoring at or below this value are ignored.
+            query_tokens: Tokens of the search query.
+
+        Returns:
+            A dict keyed by source file. Each value holds ``total_score`` (sum
+            of scores), ``match_count``, ``code_match_score`` (the best share
+            of query tokens found in a fact's code reference) and
+            ``matched_facts``.
+        """
+        file_data = {}
+        query_token_set = set(query_tokens)
+
+        for idx, score in enumerate(doc_scores):
+            if score <= score_threshold:
+                continue
+
+            fact = self.tokenized_facts[idx]
+            # The index can be loaded from disk while document_map is rebuilt
+            # from the current files, so a fact may have no known source file
+            # any more. Skip it instead of raising KeyError.
+            file_path = self.document_map.get(fact)
+            if file_path is None:
+                continue
+
+            entry = file_data.setdefault(
+                file_path,
+                {
+                    "total_score": 0,
+                    "match_count": 0,
+                    "code_match_score": 0,
+                    "matched_facts": [],
+                },
+            )
+
+            components = fact.split("|") if "|" in fact else [fact]
+
+            code_match_score = 0
+            # Guard against an empty query to avoid dividing by zero.
+            if len(components) == 3 and query_tokens:
+                code_ref = components[2].strip()
+                code_tokens = self.preprocess_text(code_ref)
+                code_match_score = len(query_token_set & set(code_tokens)) / len(
+                    query_tokens
+                )
+
+            entry["total_score"] += score
+            entry["match_count"] += 1
+            entry["code_match_score"] = max(entry["code_match_score"], code_match_score)
+            entry["matched_facts"].append(fact)
+
+        return file_data
+
+    def refresh_index(self) -> None:
+        """Rebuild the search index from scratch, ignoring existing caches."""
+        rebuild_everything = True
+        self.build_search_index(clear_cache=rebuild_everything)
--- a/crawl4ai/legacy/version_manager.py
+++ b/crawl4ai/legacy/version_manager.py
@@ -0,0 +1,29 @@
+# version_manager.py
+from pathlib import Path
+from packaging import version
+from . import __version__
+
+
+class VersionManager:
+    """Record and compare the library version stored in ``~/.crawl4ai/version.txt``."""
+
+    def __init__(self):
+        self.home_dir = Path.home() / ".crawl4ai"
+        self.version_file = self.home_dir / "version.txt"
+
+    def get_installed_version(self):
+        """Get the version recorded in home directory.
+
+        Returns:
+            The parsed version, or None when the file is missing, unreadable
+            or does not contain a valid version string.
+        """
+        if not self.version_file.exists():
+            return None
+        try:
+            return version.parse(self.version_file.read_text().strip())
+        except (OSError, ValueError):
+            # OSError: the file cannot be read. ValueError: covers undecodable
+            # content and packaging's InvalidVersion. A bare "except:" would
+            # also have swallowed KeyboardInterrupt and SystemExit.
+            return None
+
+    def update_version(self):
+        """Update the version file to current library version"""
+        # The directory may not exist yet on a fresh install.
+        self.home_dir.mkdir(parents=True, exist_ok=True)
+        self.version_file.write_text(__version__.__version__)
+
+    def needs_update(self):
+        """Check if database needs update based on version.
+
+        Returns:
+            True when no valid version is recorded or the recorded version is
+            older than the current library version.
+        """
+        installed = self.get_installed_version()
+        current = version.parse(__version__.__version__)
+        return installed is None or installed < current
--- a/crawl4ai/legacy/web_crawler.py
+++ b/crawl4ai/legacy/web_crawler.py
@@ -0,0 +1,294 @@
+import os, time
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+from pathlib import Path
+
+from .models import UrlModel, CrawlResult
+from .database import init_db, get_cached_url, cache_url
+from .utils import *
+from .chunking_strategy import *
+from .extraction_strategy import *
+from .crawler_strategy import *
+from typing import List
+from concurrent.futures import ThreadPoolExecutor
+from .content_scraping_strategy import WebScrapingStrategy
+from .config import *
+import warnings
+import json
+
+# Hide the warning about a field named "model_name" clashing with the protected
+# "model_" namespace (presumably emitted by pydantic models — confirm).
+warnings.filterwarnings(
+    "ignore",
+    message='Field "model_name" has conflict with protected namespace "model_".',
+)
+
+
+class WebCrawler:
+    def __init__(
+        self,
+        crawler_strategy: CrawlerStrategy = None,
+        always_by_pass_cache: bool = False,
+        verbose: bool = False,
+    ):
+        """Create the crawler, its working folders and its database.
+
+        Args:
+            crawler_strategy: Strategy used to fetch pages. When falsy, a
+                ``LocalSeleniumCrawlerStrategy`` is created with ``verbose``.
+            always_by_pass_cache: When True, ``run`` never reads the URL cache.
+            verbose: Passed to the default crawler strategy.
+        """
+        if not crawler_strategy:
+            crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=verbose)
+        self.crawler_strategy = crawler_strategy
+        self.always_by_pass_cache = always_by_pass_cache
+
+        # The data folder lives under CRAWL4_AI_BASE_DIRECTORY when that
+        # variable is set, otherwise under the user's home directory.
+        base_directory = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())
+        self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
+        for folder in (self.crawl4ai_folder, f"{self.crawl4ai_folder}/cache"):
+            os.makedirs(folder, exist_ok=True)
+
+        init_db()
+        # Becomes True once warmup() has finished.
+        self.ready = False
+
+    def warmup(self):
+        """Issue one run against https://google.com/ and mark the crawler ready.
+
+        NOTE(review): ``run`` returns None early while ``self.ready`` is False
+        unless ``warmup=False`` is passed, and this call does not pass it, so
+        the request below appears to stop before any crawling — confirm
+        whether that is intended.
+        """
+        print("[LOG] 🌤️  Warming up the WebCrawler")
+        warmup_options = dict(
+            url="https://google.com/",
+            word_count_threshold=5,
+            extraction_strategy=NoExtractionStrategy(),
+            bypass_cache=False,
+            verbose=False,
+        )
+        self.run(**warmup_options)
+        self.ready = True
+        print("[LOG] 🌞 WebCrawler is ready to crawl")
+
+    def fetch_page(
+        self,
+        url_model: UrlModel,
+        provider: str = DEFAULT_PROVIDER,
+        api_token: str = None,
+        extract_blocks_flag: bool = True,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        css_selector: str = None,
+        screenshot: bool = False,
+        use_cached_html: bool = False,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        **kwargs,
+    ) -> CrawlResult:
+        """Crawl the URL described by ``url_model`` via ``run``.
+
+        ``url_model.forced`` is passed as ``bypass_cache``. ``provider``,
+        ``api_token``, ``extract_blocks_flag`` and ``use_cached_html`` are
+        accepted but not used by this method. Extra keyword arguments are
+        forwarded to ``run``.
+        """
+        target_url = url_model.url
+        # Fall back to the no-op extraction strategy when none is supplied.
+        strategy = extraction_strategy or NoExtractionStrategy()
+        return self.run(
+            target_url,
+            word_count_threshold,
+            strategy,
+            chunking_strategy,
+            bypass_cache=url_model.forced,
+            css_selector=css_selector,
+            screenshot=screenshot,
+            **kwargs,
+        )
+
+    def fetch_pages(
+        self,
+        url_models: List[UrlModel],
+        provider: str = DEFAULT_PROVIDER,
+        api_token: str = None,
+        extract_blocks_flag: bool = True,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        use_cached_html: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        **kwargs,
+    ) -> List[CrawlResult]:
+        """Crawl several URLs concurrently with ``fetch_page``.
+
+        Args:
+            url_models: The pages to fetch.
+            **kwargs: Extra keyword arguments forwarded to every
+                ``fetch_page`` call. The remaining parameters are passed
+                through to ``fetch_page`` unchanged.
+
+        Returns:
+            One result per entry of ``url_models``, in the same order.
+        """
+        extraction_strategy = extraction_strategy or NoExtractionStrategy()
+
+        def fetch_one(url_model):
+            # Bind the shared options by name. They used to be passed to
+            # executor.map as parallel iterables together with the kwargs
+            # dicts themselves; map stops at the shortest iterable, so an
+            # empty kwargs dict meant no page was fetched at all, and a
+            # non-empty one fed its keys in as positional arguments.
+            return self.fetch_page(
+                url_model,
+                provider=provider,
+                api_token=api_token,
+                extract_blocks_flag=extract_blocks_flag,
+                word_count_threshold=word_count_threshold,
+                css_selector=css_selector,
+                screenshot=screenshot,
+                use_cached_html=use_cached_html,
+                extraction_strategy=extraction_strategy,
+                chunking_strategy=chunking_strategy,
+                **kwargs,
+            )
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(fetch_one, url_models))
+
+        return results
+
+    def run(
+        self,
+        url: str,
+        word_count_threshold=MIN_WORD_THRESHOLD,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = RegexChunking(),
+        bypass_cache: bool = False,
+        css_selector: str = None,
+        screenshot: bool = False,
+        user_agent: str = None,
+        verbose=True,
+        **kwargs,
+    ) -> CrawlResult:
+        """Crawl ``url`` (or reuse its cached copy) and process the HTML.
+
+        Args:
+            url: Page to crawl.
+            word_count_threshold: Raised to at least MIN_WORD_THRESHOLD.
+            extraction_strategy: Defaults to ``NoExtractionStrategy()``.
+            chunking_strategy: Strategy used to split the markdown.
+            bypass_cache: When True the cache is not consulted.
+            css_selector: Forwarded to ``process_html``.
+            screenshot: When True a screenshot is taken or read from the cache.
+            user_agent: When set, applied to the crawler strategy before crawling.
+            verbose: Enables progress output on stdout.
+            **kwargs: Forwarded to the crawler strategy and to ``process_html``.
+                ``warmup`` (default True) is also read here, see below.
+
+        Returns:
+            The CrawlResult from ``process_html``; a CrawlResult with
+            ``success=False`` when any exception is raised; or None when the
+            crawler is not ready yet and ``warmup`` is truthy.
+        """
+        try:
+            extraction_strategy = extraction_strategy or NoExtractionStrategy()
+            extraction_strategy.verbose = verbose
+            if not isinstance(extraction_strategy, ExtractionStrategy):
+                raise ValueError("Unsupported extraction strategy")
+            if not isinstance(chunking_strategy, ChunkingStrategy):
+                raise ValueError("Unsupported chunking strategy")
+
+            word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
+
+            cached = None
+            screenshot_data = None
+            extracted_content = None
+            if not bypass_cache and not self.always_by_pass_cache:
+                cached = get_cached_url(url)
+
+            # Until warmup() has set self.ready, return early unless the caller
+            # passes warmup=False.
+            if kwargs.get("warmup", True) and not self.ready:
+                return None
+
+            if cached:
+                # Positions 1, 4 and 9 of the cached row are used as html,
+                # extracted content and screenshot respectively.
+                html = sanitize_input_encode(cached[1])
+                extracted_content = sanitize_input_encode(cached[4])
+                if screenshot:
+                    screenshot_data = cached[9]
+                    if not screenshot_data:
+                        # A screenshot was requested but is not cached: drop the
+                        # cache hit so the page is crawled again below.
+                        cached = None
+
+            # "html" is only bound when a cache hit occurred; the short-circuit
+            # on "not cached" keeps it from being read otherwise.
+            if not cached or not html:
+                if user_agent:
+                    self.crawler_strategy.update_user_agent(user_agent)
+                t1 = time.time()
+                html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
+                t2 = time.time()
+                if verbose:
+                    print(
+                        f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
+                    )
+                if screenshot:
+                    screenshot_data = self.crawler_strategy.take_screenshot()
+
+            crawl_result = self.process_html(
+                url,
+                html,
+                extracted_content,
+                word_count_threshold,
+                extraction_strategy,
+                chunking_strategy,
+                css_selector,
+                screenshot_data,
+                verbose,
+                bool(cached),
+                **kwargs,
+            )
+            crawl_result.success = bool(html)
+            return crawl_result
+        except Exception as e:
+            # Every failure is reported through the result object, not raised.
+            if not hasattr(e, "msg"):
+                e.msg = str(e)
+            print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
+            return CrawlResult(url=url, html="", success=False, error_message=e.msg)
+
+    def process_html(
+        self,
+        url: str,
+        html: str,
+        extracted_content: str,
+        word_count_threshold: int,
+        extraction_strategy: ExtractionStrategy,
+        chunking_strategy: ChunkingStrategy,
+        css_selector: str,
+        screenshot: bool,
+        verbose: bool,
+        is_cached: bool,
+        **kwargs,
+    ) -> CrawlResult:
+        """Scrape ``html``, run extraction if needed, cache and return the result.
+
+        Args:
+            url: Address the HTML came from.
+            html: Raw page HTML.
+            extracted_content: Previously extracted content; when None the
+                markdown is chunked and passed through ``extraction_strategy``.
+            word_count_threshold: Forwarded to the scraping strategy.
+            extraction_strategy: Used only when ``extracted_content`` is None.
+            chunking_strategy: Splits the markdown before extraction.
+            css_selector: Forwarded to the scraping strategy.
+            screenshot: Screenshot value stored in the cache and the result;
+                falsy values are normalised to None.
+            verbose: Enables progress output on stdout.
+            is_cached: When True the result is not written to the cache again.
+            **kwargs: ``only_text`` and ``image_description_min_word_threshold``
+                are read here; everything else is passed to the scraper as-is.
+
+        Raises:
+            ValueError: When the scraper returns None or reports an invalid
+                CSS selector.
+        """
+        started_at = time.time()
+
+        # Extract content from HTML
+        try:
+            scrape_started_at = time.time()
+            scraper = WebScrapingStrategy()
+            handled_keys = ("only_text", "image_description_min_word_threshold")
+            passthrough = {
+                key: value for key, value in kwargs.items() if key not in handled_keys
+            }
+            only_text = kwargs.get("only_text", False)
+            image_word_threshold = kwargs.get(
+                "image_description_min_word_threshold",
+                IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+            )
+            result = scraper.scrap(
+                url,
+                html,
+                word_count_threshold=word_count_threshold,
+                css_selector=css_selector,
+                only_text=only_text,
+                image_description_min_word_threshold=image_word_threshold,
+                **passthrough,
+            )
+
+            if verbose:
+                print(
+                    f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - scrape_started_at:.2f} seconds"
+                )
+
+            if result is None:
+                raise ValueError(f"Failed to extract content from the website: {url}")
+        except InvalidCSSSelectorError as e:
+            raise ValueError(str(e))
+
+        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+        markdown = sanitize_input_encode(result.get("markdown", ""))
+        media = result.get("media", [])
+        links = result.get("links", [])
+        metadata = result.get("metadata", {})
+
+        # Run extraction only when no extracted content was handed in.
+        if extracted_content is None:
+            if verbose:
+                print(
+                    f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}"
+                )
+
+            sections = chunking_strategy.chunk(markdown)
+            extraction_output = extraction_strategy.run(url, sections)
+            extracted_content = json.dumps(
+                extraction_output, indent=4, default=str, ensure_ascii=False
+            )
+
+            if verbose:
+                print(
+                    f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - started_at:.2f} seconds."
+                )
+
+        # Normalise every falsy screenshot value to None.
+        screenshot = screenshot or None
+
+        if not is_cached:
+            cache_url(
+                url,
+                html,
+                cleaned_html,
+                markdown,
+                extracted_content,
+                True,
+                json.dumps(media),
+                json.dumps(links),
+                json.dumps(metadata),
+                screenshot=screenshot,
+            )
+
+        return CrawlResult(
+            url=url,
+            html=html,
+            cleaned_html=format_html(cleaned_html),
+            markdown=markdown,
+            media=media,
+            links=links,
+            metadata=metadata,
+            screenshot=screenshot,
+            extracted_content=extracted_content,
+            success=True,
+            error_message="",
+        )