refactor(core): reorganize project structure and remove legacy code

Major reorganization of the project structure:
- Moved legacy synchronous crawler code to legacy folder
- Removed deprecated CLI and docs manager
- Consolidated version manager into utils.py
- Added CrawlerHub to __init__.py exports
- Fixed type hints in async_webcrawler.py
- Fixed minor bugs in chunking and crawler strategies

BREAKING CHANGE: Removed synchronous WebCrawler, CLI, and docs management functionality. Users should migrate to AsyncWebCrawler.
This commit is contained in:
UncleCode
2025-01-30 19:35:06 +08:00
parent 31938fb922
commit f81712eb91
23 changed files with 425 additions and 4 deletions

View File

123
crawl4ai/legacy/cli.py Normal file
View File

@@ -0,0 +1,123 @@
import click
import sys
import asyncio
from typing import List
from .docs_manager import DocsManager
from .async_logger import AsyncLogger
logger = AsyncLogger(verbose=True)
docs_manager = DocsManager(logger)
def print_table(headers: List[str], rows: List[List[str]], padding: int = 2):
"""Print formatted table with headers and rows"""
widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)]
border = "+" + "+".join("-" * (w + 2 * padding) for w in widths) + "+"
def format_row(row):
return (
"|"
+ "|".join(
f"{' ' * padding}{str(cell):<{w}}{' ' * padding}"
for cell, w in zip(row, widths)
)
+ "|"
)
click.echo(border)
click.echo(format_row(headers))
click.echo(border)
for row in rows:
click.echo(format_row(row))
click.echo(border)
@click.group()
def cli():
"""Crawl4AI Command Line Interface"""
pass
@cli.group()
def docs():
"""Documentation operations"""
pass
@docs.command()
@click.argument("sections", nargs=-1)
@click.option(
"--mode", type=click.Choice(["extended", "condensed"]), default="extended"
)
def combine(sections: tuple, mode: str):
"""Combine documentation sections"""
try:
asyncio.run(docs_manager.ensure_docs_exist())
click.echo(docs_manager.generate(sections, mode))
except Exception as e:
logger.error(str(e), tag="ERROR")
sys.exit(1)
@docs.command()
@click.argument("query")
@click.option("--top-k", "-k", default=5)
@click.option("--build-index", is_flag=True, help="Build index if missing")
def search(query: str, top_k: int, build_index: bool):
"""Search documentation"""
try:
result = docs_manager.search(query, top_k)
if result == "No search index available. Call build_search_index() first.":
if build_index or click.confirm("No search index found. Build it now?"):
asyncio.run(docs_manager.llm_text.generate_index_files())
result = docs_manager.search(query, top_k)
click.echo(result)
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
@docs.command()
def update():
"""Update docs from GitHub"""
try:
asyncio.run(docs_manager.fetch_docs())
click.echo("Documentation updated successfully")
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
@docs.command()
@click.option("--force-facts", is_flag=True, help="Force regenerate fact files")
@click.option("--clear-cache", is_flag=True, help="Clear BM25 cache")
def index(force_facts: bool, clear_cache: bool):
"""Build or rebuild search indexes"""
try:
asyncio.run(docs_manager.ensure_docs_exist())
asyncio.run(
docs_manager.llm_text.generate_index_files(
force_generate_facts=force_facts, clear_bm25_cache=clear_cache
)
)
click.echo("Search indexes built successfully")
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
# Add docs list command
@docs.command()
def list():
"""List available documentation sections"""
try:
sections = docs_manager.list()
print_table(["Sections"], [[section] for section in sections])
except Exception as e:
click.echo(f"Error: {str(e)}", err=True)
sys.exit(1)
if __name__ == "__main__":
cli()

View File

@@ -0,0 +1,394 @@
from abc import ABC, abstractmethod
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import InvalidArgumentException, WebDriverException
# from selenium.webdriver.chrome.service import Service as ChromeService
# from webdriver_manager.chrome import ChromeDriverManager
# from urllib3.exceptions import MaxRetryError
from .config import *
import logging, time
import base64
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from typing import Callable
import requests
import os
from pathlib import Path
from .utils import *
logger = logging.getLogger("selenium.webdriver.remote.remote_connection")
logger.setLevel(logging.WARNING)
logger_driver = logging.getLogger("selenium.webdriver.common.service")
logger_driver.setLevel(logging.WARNING)
urllib3_logger = logging.getLogger("urllib3.connectionpool")
urllib3_logger.setLevel(logging.WARNING)
# Disable http.client logging
http_client_logger = logging.getLogger("http.client")
http_client_logger.setLevel(logging.WARNING)
# Disable driver_finder and service logging
driver_finder_logger = logging.getLogger("selenium.webdriver.common.driver_finder")
driver_finder_logger.setLevel(logging.WARNING)
class CrawlerStrategy(ABC):
@abstractmethod
def crawl(self, url: str, **kwargs) -> str:
pass
@abstractmethod
def take_screenshot(self, save_path: str):
pass
@abstractmethod
def update_user_agent(self, user_agent: str):
pass
@abstractmethod
def set_hook(self, hook_type: str, hook: Callable):
pass
class CloudCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html=False):
super().__init__()
self.use_cached_html = use_cached_html
def crawl(self, url: str) -> str:
data = {
"urls": [url],
"include_raw_html": True,
"forced": True,
"extract_blocks": False,
}
response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
response = response.json()
html = response["results"][0]["html"]
return sanitize_input_encode(html)
class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
def __init__(self, use_cached_html=False, js_code=None, **kwargs):
super().__init__()
print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
self.options = Options()
self.options.headless = True
if kwargs.get("proxy"):
self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
if kwargs.get("user_agent"):
self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
else:
user_agent = kwargs.get(
"user_agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
)
self.options.add_argument(f"--user-agent={user_agent}")
self.options.add_argument(
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
self.options.headless = kwargs.get("headless", True)
if self.options.headless:
self.options.add_argument("--headless")
self.options.add_argument("--disable-gpu")
self.options.add_argument("--window-size=1920,1080")
self.options.add_argument("--no-sandbox")
self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--disable-blink-features=AutomationControlled")
# self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--disable-gpu")
# self.options.add_argument("--disable-extensions")
# self.options.add_argument("--disable-infobars")
# self.options.add_argument("--disable-logging")
# self.options.add_argument("--disable-popup-blocking")
# self.options.add_argument("--disable-translate")
# self.options.add_argument("--disable-default-apps")
# self.options.add_argument("--disable-background-networking")
# self.options.add_argument("--disable-sync")
# self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
# self.options.add_argument("--disable-browser-side-navigation")
# self.options.add_argument("--dns-prefetch-disable")
# self.options.add_argument("--disable-web-security")
self.options.add_argument("--log-level=3")
self.use_cached_html = use_cached_html
self.use_cached_html = use_cached_html
self.js_code = js_code
self.verbose = kwargs.get("verbose", False)
# Hooks
self.hooks = {
"on_driver_created": None,
"on_user_agent_updated": None,
"before_get_url": None,
"after_get_url": None,
"before_return_html": None,
}
# chromedriver_autoinstaller.install()
# import chromedriver_autoinstaller
# crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
# chromedriver_path = chromedriver_autoinstaller.install()
# chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
# self.service = Service(chromedriver_autoinstaller.install())
# chromedriver_path = ChromeDriverManager().install()
# self.service = Service(chromedriver_path)
# self.service.log_path = "NUL"
# self.driver = webdriver.Chrome(service=self.service, options=self.options)
# Use selenium-manager (built into Selenium 4.10.0+)
self.service = Service()
self.driver = webdriver.Chrome(options=self.options)
self.driver = self.execute_hook("on_driver_created", self.driver)
if kwargs.get("cookies"):
for cookie in kwargs.get("cookies"):
self.driver.add_cookie(cookie)
def set_hook(self, hook_type: str, hook: Callable):
if hook_type in self.hooks:
self.hooks[hook_type] = hook
else:
raise ValueError(f"Invalid hook type: {hook_type}")
def execute_hook(self, hook_type: str, *args):
hook = self.hooks.get(hook_type)
if hook:
result = hook(*args)
if result is not None:
if isinstance(result, webdriver.Chrome):
return result
else:
raise TypeError(
f"Hook {hook_type} must return an instance of webdriver.Chrome or None."
)
# If the hook returns None or there is no hook, return self.driver
return self.driver
def update_user_agent(self, user_agent: str):
self.options.add_argument(f"user-agent={user_agent}")
self.driver.quit()
self.driver = webdriver.Chrome(service=self.service, options=self.options)
self.driver = self.execute_hook("on_user_agent_updated", self.driver)
def set_custom_headers(self, headers: dict):
# Enable Network domain for sending headers
self.driver.execute_cdp_cmd("Network.enable", {})
# Set extra HTTP headers
self.driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": headers})
def _ensure_page_load(self, max_checks=6, check_interval=0.01):
initial_length = len(self.driver.page_source)
for ix in range(max_checks):
# print(f"Checking page load: {ix}")
time.sleep(check_interval)
current_length = len(self.driver.page_source)
if current_length != initial_length:
break
return self.driver.page_source
def crawl(self, url: str, **kwargs) -> str:
# Create md5 hash of the URL
import hashlib
url_hash = hashlib.md5(url.encode()).hexdigest()
if self.use_cached_html:
cache_file_path = os.path.join(
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()),
".crawl4ai",
"cache",
url_hash,
)
if os.path.exists(cache_file_path):
with open(cache_file_path, "r") as f:
return sanitize_input_encode(f.read())
try:
self.driver = self.execute_hook("before_get_url", self.driver)
if self.verbose:
print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
self.driver.get(url) # <html><head></head><body></body></html>
WebDriverWait(self.driver, 20).until(
lambda d: d.execute_script("return document.readyState") == "complete"
)
WebDriverWait(self.driver, 10).until(
EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
)
self.driver.execute_script(
"window.scrollTo(0, document.body.scrollHeight);"
)
self.driver = self.execute_hook("after_get_url", self.driver)
html = sanitize_input_encode(
self._ensure_page_load()
) # self.driver.page_source
can_not_be_done_headless = (
False # Look at my creativity for naming variables
)
# TODO: Very ugly approach, but promise to change it!
if (
kwargs.get("bypass_headless", False)
or html == "<html><head></head><body></body></html>"
):
print(
"[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode..."
)
can_not_be_done_headless = True
options = Options()
options.headless = False
# set window size very small
options.add_argument("--window-size=5,5")
driver = webdriver.Chrome(service=self.service, options=options)
driver.get(url)
self.driver = self.execute_hook("after_get_url", driver)
html = sanitize_input_encode(driver.page_source)
driver.quit()
# Execute JS code if provided
self.js_code = kwargs.get("js_code", self.js_code)
if self.js_code and type(self.js_code) == str:
self.driver.execute_script(self.js_code)
# Optionally, wait for some condition after executing the JS code
WebDriverWait(self.driver, 10).until(
lambda driver: driver.execute_script("return document.readyState")
== "complete"
)
elif self.js_code and type(self.js_code) == list:
for js in self.js_code:
self.driver.execute_script(js)
WebDriverWait(self.driver, 10).until(
lambda driver: driver.execute_script(
"return document.readyState"
)
== "complete"
)
# Optionally, wait for some condition after executing the JS code : Contributed by (https://github.com/jonymusky)
wait_for = kwargs.get("wait_for", False)
if wait_for:
if callable(wait_for):
print("[LOG] 🔄 Waiting for condition...")
WebDriverWait(self.driver, 20).until(wait_for)
else:
print("[LOG] 🔄 Waiting for condition...")
WebDriverWait(self.driver, 20).until(
EC.presence_of_element_located((By.CSS_SELECTOR, wait_for))
)
if not can_not_be_done_headless:
html = sanitize_input_encode(self.driver.page_source)
self.driver = self.execute_hook("before_return_html", self.driver, html)
# Store in cache
cache_file_path = os.path.join(
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()),
".crawl4ai",
"cache",
url_hash,
)
with open(cache_file_path, "w", encoding="utf-8") as f:
f.write(html)
if self.verbose:
print(f"[LOG] ✅ Crawled {url} successfully!")
return html
except InvalidArgumentException as e:
if not hasattr(e, "msg"):
e.msg = sanitize_input_encode(str(e))
raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
except WebDriverException as e:
# If e does nlt have msg attribute create it and set it to str(e)
if not hasattr(e, "msg"):
e.msg = sanitize_input_encode(str(e))
raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
except Exception as e:
if not hasattr(e, "msg"):
e.msg = sanitize_input_encode(str(e))
raise Exception(f"Failed to crawl {url}: {e.msg}")
def take_screenshot(self) -> str:
try:
# Get the dimensions of the page
total_width = self.driver.execute_script("return document.body.scrollWidth")
total_height = self.driver.execute_script(
"return document.body.scrollHeight"
)
# Set the window size to the dimensions of the page
self.driver.set_window_size(total_width, total_height)
# Take screenshot
screenshot = self.driver.get_screenshot_as_png()
# Open the screenshot with PIL
image = Image.open(BytesIO(screenshot))
# Convert image to RGB mode (this will handle both RGB and RGBA images)
rgb_image = image.convert("RGB")
# Convert to JPEG and compress
buffered = BytesIO()
rgb_image.save(buffered, format="JPEG", quality=85)
img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
if self.verbose:
print("[LOG] 📸 Screenshot taken and converted to base64")
return img_base64
except Exception as e:
error_message = sanitize_input_encode(
f"Failed to take screenshot: {str(e)}"
)
print(error_message)
# Generate an image with black background
img = Image.new("RGB", (800, 600), color="black")
draw = ImageDraw.Draw(img)
# Load a font
try:
font = ImageFont.truetype("arial.ttf", 40)
except IOError:
font = ImageFont.load_default()
# Define text color and wrap the text
text_color = (255, 255, 255)
max_width = 780
wrapped_text = wrap_text(draw, error_message, font, max_width)
# Calculate text position
text_position = (10, 10)
# Draw the text on the image
draw.text(text_position, wrapped_text, fill=text_color, font=font)
# Convert to base64
buffered = BytesIO()
img.save(buffered, format="JPEG")
img_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
return img_base64
def quit(self):
self.driver.quit()

180
crawl4ai/legacy/database.py Normal file
View File

@@ -0,0 +1,180 @@
import os
from pathlib import Path
import sqlite3
from typing import Optional, Tuple
DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
def init_db():
global DB_PATH
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS crawled_data (
url TEXT PRIMARY KEY,
html TEXT,
cleaned_html TEXT,
markdown TEXT,
extracted_content TEXT,
success BOOLEAN,
media TEXT DEFAULT "{}",
links TEXT DEFAULT "{}",
metadata TEXT DEFAULT "{}",
screenshot TEXT DEFAULT ""
)
"""
)
conn.commit()
conn.close()
def alter_db_add_screenshot(new_column: str = "media"):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute(
f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""'
)
conn.commit()
conn.close()
except Exception as e:
print(f"Error altering database to add screenshot column: {e}")
def check_db_path():
if not DB_PATH:
raise ValueError("Database path is not set or is empty.")
def get_cached_url(
url: str,
) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]:
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute(
"SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?",
(url,),
)
result = cursor.fetchone()
conn.close()
return result
except Exception as e:
print(f"Error retrieving cached URL: {e}")
return None
def cache_url(
url: str,
html: str,
cleaned_html: str,
markdown: str,
extracted_content: str,
success: bool,
media: str = "{}",
links: str = "{}",
metadata: str = "{}",
screenshot: str = "",
):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute(
"""
INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
ON CONFLICT(url) DO UPDATE SET
html = excluded.html,
cleaned_html = excluded.cleaned_html,
markdown = excluded.markdown,
extracted_content = excluded.extracted_content,
success = excluded.success,
media = excluded.media,
links = excluded.links,
metadata = excluded.metadata,
screenshot = excluded.screenshot
""",
(
url,
html,
cleaned_html,
markdown,
extracted_content,
success,
media,
links,
metadata,
screenshot,
),
)
conn.commit()
conn.close()
except Exception as e:
print(f"Error caching URL: {e}")
def get_total_count() -> int:
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM crawled_data")
result = cursor.fetchone()
conn.close()
return result[0]
except Exception as e:
print(f"Error getting total count: {e}")
return 0
def clear_db():
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("DELETE FROM crawled_data")
conn.commit()
conn.close()
except Exception as e:
print(f"Error clearing database: {e}")
def flush_db():
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("DROP TABLE crawled_data")
conn.commit()
conn.close()
except Exception as e:
print(f"Error flushing database: {e}")
def update_existing_records(new_column: str = "media", default_value: str = "{}"):
check_db_path()
try:
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute(
f'UPDATE crawled_data SET {new_column} = "{default_value}" WHERE screenshot IS NULL'
)
conn.commit()
conn.close()
except Exception as e:
print(f"Error updating existing records: {e}")
if __name__ == "__main__":
# Delete the existing database file
if os.path.exists(DB_PATH):
os.remove(DB_PATH)
init_db()
# alter_db_add_screenshot("COL_NAME")

View File

@@ -0,0 +1,75 @@
import requests
import shutil
from pathlib import Path
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.llmtxt import AsyncLLMTextManager
class DocsManager:
def __init__(self, logger=None):
self.docs_dir = Path.home() / ".crawl4ai" / "docs"
self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt"
self.docs_dir.mkdir(parents=True, exist_ok=True)
self.logger = logger or AsyncLogger(verbose=True)
self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger)
async def ensure_docs_exist(self):
"""Fetch docs if not present"""
if not any(self.docs_dir.iterdir()):
await self.fetch_docs()
async def fetch_docs(self) -> bool:
"""Copy from local docs or download from GitHub"""
try:
# Try local first
if self.local_docs.exists() and (
any(self.local_docs.glob("*.md"))
or any(self.local_docs.glob("*.tokens"))
):
# Empty the local docs directory
for file_path in self.docs_dir.glob("*.md"):
file_path.unlink()
# for file_path in self.docs_dir.glob("*.tokens"):
# file_path.unlink()
for file_path in self.local_docs.glob("*.md"):
shutil.copy2(file_path, self.docs_dir / file_path.name)
# for file_path in self.local_docs.glob("*.tokens"):
# shutil.copy2(file_path, self.docs_dir / file_path.name)
return True
# Fallback to GitHub
response = requests.get(
"https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt",
headers={"Accept": "application/vnd.github.v3+json"},
)
response.raise_for_status()
for item in response.json():
if item["type"] == "file" and item["name"].endswith(".md"):
content = requests.get(item["download_url"]).text
with open(self.docs_dir / item["name"], "w", encoding="utf-8") as f:
f.write(content)
return True
except Exception as e:
self.logger.error(f"Failed to fetch docs: {str(e)}")
raise
def list(self) -> list[str]:
"""List available topics"""
names = [file_path.stem for file_path in self.docs_dir.glob("*.md")]
# Remove [0-9]+_ prefix
names = [name.split("_", 1)[1] if name[0].isdigit() else name for name in names]
# Exclude those end with .xs.md and .q.md
names = [
name
for name in names
if not name.endswith(".xs") and not name.endswith(".q")
]
return names
def generate(self, sections, mode="extended"):
return self.llm_text.generate(sections, mode)
def search(self, query: str, top_k: int = 5):
return self.llm_text.search(query, top_k)

546
crawl4ai/legacy/llmtxt.py Normal file
View File

@@ -0,0 +1,546 @@
import os
from pathlib import Path
import re
from typing import Dict, List, Tuple, Optional, Any
import json
from tqdm import tqdm
import time
import psutil
import numpy as np
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from litellm import batch_completion
from .async_logger import AsyncLogger
import litellm
import pickle
import hashlib # <--- ADDED for file-hash
import glob
litellm.set_verbose = False
def _compute_file_hash(file_path: Path) -> str:
"""Compute MD5 hash for the file's entire content."""
hash_md5 = hashlib.md5()
with file_path.open("rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
class AsyncLLMTextManager:
def __init__(
self,
docs_dir: Path,
logger: Optional[AsyncLogger] = None,
max_concurrent_calls: int = 5,
batch_size: int = 3,
) -> None:
self.docs_dir = docs_dir
self.logger = logger
self.max_concurrent_calls = max_concurrent_calls
self.batch_size = batch_size
self.bm25_index = None
self.document_map: Dict[str, Any] = {}
self.tokenized_facts: List[str] = []
self.bm25_index_file = self.docs_dir / "bm25_index.pkl"
async def _process_document_batch(self, doc_batch: List[Path]) -> None:
"""Process a batch of documents in parallel"""
contents = []
for file_path in doc_batch:
try:
with open(file_path, "r", encoding="utf-8") as f:
contents.append(f.read())
except Exception as e:
self.logger.error(f"Error reading {file_path}: {str(e)}")
contents.append("") # Add empty content to maintain batch alignment
prompt = """Given a documentation file, generate a list of atomic facts where each fact:
1. Represents a single piece of knowledge
2. Contains variations in terminology for the same concept
3. References relevant code patterns if they exist
4. Is written in a way that would match natural language queries
Each fact should follow this format:
<main_concept>: <fact_statement> | <related_terms> | <code_reference>
Example Facts:
browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True)
redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0)
pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5]
Wrap your response in <index>...</index> tags.
"""
# Prepare messages for batch processing
messages_list = [
[
{
"role": "user",
"content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}",
}
]
for content in contents
if content
]
try:
responses = batch_completion(
model="anthropic/claude-3-5-sonnet-latest",
messages=messages_list,
logger_fn=None,
)
# Process responses and save index files
for response, file_path in zip(responses, doc_batch):
try:
index_content_match = re.search(
r"<index>(.*?)</index>",
response.choices[0].message.content,
re.DOTALL,
)
if not index_content_match:
self.logger.warning(
f"No <index>...</index> content found for {file_path}"
)
continue
index_content = re.sub(
r"\n\s*\n", "\n", index_content_match.group(1)
).strip()
if index_content:
index_file = file_path.with_suffix(".q.md")
with open(index_file, "w", encoding="utf-8") as f:
f.write(index_content)
self.logger.info(f"Created index file: {index_file}")
else:
self.logger.warning(
f"No index content found in response for {file_path}"
)
except Exception as e:
self.logger.error(
f"Error processing response for {file_path}: {str(e)}"
)
except Exception as e:
self.logger.error(f"Error in batch completion: {str(e)}")
def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]:
if "|" not in line:
return False, "Missing separator '|'"
parts = [p.strip() for p in line.split("|")]
if len(parts) != 3:
return False, f"Expected 3 parts, got {len(parts)}"
concept_part = parts[0]
if ":" not in concept_part:
return False, "Missing ':' in concept definition"
return True, None
def _load_or_create_token_cache(self, fact_file: Path) -> Dict:
"""
Load token cache from .q.tokens if present and matching file hash.
Otherwise return a new structure with updated file-hash.
"""
cache_file = fact_file.with_suffix(".q.tokens")
current_hash = _compute_file_hash(fact_file)
if cache_file.exists():
try:
with open(cache_file, "r") as f:
cache = json.load(f)
# If the hash matches, return it directly
if cache.get("content_hash") == current_hash:
return cache
# Otherwise, we signal that it's changed
self.logger.info(f"Hash changed for {fact_file}, reindex needed.")
except json.JSONDecodeError:
self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.")
except Exception as e:
self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}")
# Return a fresh cache
return {"facts": {}, "content_hash": current_hash}
def _save_token_cache(self, fact_file: Path, cache: Dict) -> None:
cache_file = fact_file.with_suffix(".q.tokens")
# Always ensure we're saving the correct file-hash
cache["content_hash"] = _compute_file_hash(fact_file)
with open(cache_file, "w") as f:
json.dump(cache, f)
def preprocess_text(self, text: str) -> List[str]:
parts = [x.strip() for x in text.split("|")] if "|" in text else [text]
# Remove : after the first word of parts[0]
parts[0] = re.sub(r"^(.*?):", r"\1", parts[0])
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english")) - {
"how",
"what",
"when",
"where",
"why",
"which",
}
tokens = []
for part in parts:
if "(" in part and ")" in part:
code_tokens = re.findall(
r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', part
)
tokens.extend(code_tokens)
words = word_tokenize(part.lower())
tokens.extend(
[
lemmatizer.lemmatize(token)
for token in words
if token not in stop_words
]
)
return tokens
def maybe_load_bm25_index(self, clear_cache=False) -> bool:
"""
Load existing BM25 index from disk, if present and clear_cache=False.
"""
if not clear_cache and os.path.exists(self.bm25_index_file):
self.logger.info("Loading existing BM25 index from disk.")
with open(self.bm25_index_file, "rb") as f:
data = pickle.load(f)
self.tokenized_facts = data["tokenized_facts"]
self.bm25_index = data["bm25_index"]
return True
return False
def build_search_index(self, clear_cache=False) -> None:
"""
Checks for new or modified .q.md files by comparing file-hash.
If none need reindexing and clear_cache is False, loads existing index if available.
Otherwise, reindexes only changed/new files and merges or creates a new index.
"""
# If clear_cache is True, we skip partial logic: rebuild everything from scratch
if clear_cache:
self.logger.info("Clearing cache and rebuilding full search index.")
if self.bm25_index_file.exists():
self.bm25_index_file.unlink()
process = psutil.Process()
self.logger.info("Checking which .q.md files need (re)indexing...")
# Gather all .q.md files
q_files = [
self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")
]
# We'll store known (unchanged) facts in these lists
existing_facts: List[str] = []
existing_tokens: List[List[str]] = []
# Keep track of invalid lines for logging
invalid_lines = []
needSet = [] # files that must be (re)indexed
for qf in q_files:
token_cache_file = qf.with_suffix(".q.tokens")
# If no .q.tokens or clear_cache is True → definitely reindex
if clear_cache or not token_cache_file.exists():
needSet.append(qf)
continue
# Otherwise, load the existing cache and compare hash
cache = self._load_or_create_token_cache(qf)
# If the .q.tokens was out of date (i.e. changed hash), we reindex
if len(cache["facts"]) == 0 or cache.get(
"content_hash"
) != _compute_file_hash(qf):
needSet.append(qf)
else:
# File is unchanged → retrieve cached token data
for line, cache_data in cache["facts"].items():
existing_facts.append(line)
existing_tokens.append(cache_data["tokens"])
self.document_map[line] = qf # track the doc for that fact
if not needSet and not clear_cache:
# If no file needs reindexing, try loading existing index
if self.maybe_load_bm25_index(clear_cache=False):
self.logger.info(
"No new/changed .q.md files found. Using existing BM25 index."
)
return
else:
# If there's no existing index, we must build a fresh index from the old caches
self.logger.info(
"No existing BM25 index found. Building from cached facts."
)
if existing_facts:
self.logger.info(
f"Building BM25 index with {len(existing_facts)} cached facts."
)
self.bm25_index = BM25Okapi(existing_tokens)
self.tokenized_facts = existing_facts
with open(self.bm25_index_file, "wb") as f:
pickle.dump(
{
"bm25_index": self.bm25_index,
"tokenized_facts": self.tokenized_facts,
},
f,
)
else:
self.logger.warning("No facts found at all. Index remains empty.")
return
# ----------------------------------------------------- /Users/unclecode/.crawl4ai/docs/14_proxy_security.q.q.tokens '/Users/unclecode/.crawl4ai/docs/14_proxy_security.q.md'
# If we reach here, we have new or changed .q.md files
# We'll parse them, reindex them, and then combine with existing_facts
# -----------------------------------------------------
self.logger.info(f"{len(needSet)} file(s) need reindexing. Parsing now...")
# 1) Parse the new or changed .q.md files
new_facts = []
new_tokens = []
with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar:
for file in needSet:
# We'll build up a fresh cache
fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)}
try:
with open(file, "r", encoding="utf-8") as f_obj:
content = f_obj.read().strip()
lines = [l.strip() for l in content.split("\n") if l.strip()]
for line in lines:
is_valid, error = self._validate_fact_line(line)
if not is_valid:
invalid_lines.append((file, line, error))
continue
tokens = self.preprocess_text(line)
fresh_cache["facts"][line] = {
"tokens": tokens,
"added": time.time(),
}
new_facts.append(line)
new_tokens.append(tokens)
self.document_map[line] = file
# Save the new .q.tokens with updated hash
self._save_token_cache(file, fresh_cache)
mem_usage = process.memory_info().rss / 1024 / 1024
self.logger.debug(
f"Memory usage after {file.name}: {mem_usage:.2f}MB"
)
except Exception as e:
self.logger.error(f"Error processing {file}: {str(e)}")
file_pbar.update(1)
if invalid_lines:
self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:")
for file, line, error in invalid_lines:
self.logger.warning(f"{file}: {error} in line: {line[:50]}...")
# 2) Merge newly tokenized facts with the existing ones
all_facts = existing_facts + new_facts
all_tokens = existing_tokens + new_tokens
# 3) Build BM25 index from combined facts
self.logger.info(
f"Building BM25 index with {len(all_facts)} total facts (old + new)."
)
self.bm25_index = BM25Okapi(all_tokens)
self.tokenized_facts = all_facts
# 4) Save the updated BM25 index to disk
with open(self.bm25_index_file, "wb") as f:
pickle.dump(
{
"bm25_index": self.bm25_index,
"tokenized_facts": self.tokenized_facts,
},
f,
)
final_mem = process.memory_info().rss / 1024 / 1024
self.logger.info(f"Search index updated. Final memory usage: {final_mem:.2f}MB")
async def generate_index_files(
self, force_generate_facts: bool = False, clear_bm25_cache: bool = False
) -> None:
"""
Generate index files for all documents in parallel batches
Args:
force_generate_facts (bool): If True, regenerate indexes even if they exist
clear_bm25_cache (bool): If True, clear existing BM25 index cache
"""
self.logger.info("Starting index generation for documentation files.")
md_files = [
self.docs_dir / f
for f in os.listdir(self.docs_dir)
if f.endswith(".md") and not any(f.endswith(x) for x in [".q.md", ".xs.md"])
]
# Filter out files that already have .q files unless force=True
if not force_generate_facts:
md_files = [
f
for f in md_files
if not (self.docs_dir / f.name.replace(".md", ".q.md")).exists()
]
if not md_files:
self.logger.info("All index files exist. Use force=True to regenerate.")
else:
# Process documents in batches
for i in range(0, len(md_files), self.batch_size):
batch = md_files[i : i + self.batch_size]
self.logger.info(
f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}"
)
await self._process_document_batch(batch)
self.logger.info("Index generation complete, building/updating search index.")
self.build_search_index(clear_cache=clear_bm25_cache)
def generate(self, sections: List[str], mode: str = "extended") -> str:
# Get all markdown files
all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + glob.glob(
str(self.docs_dir / "[0-9]*.xs.md")
)
# Extract base names without extensions
base_docs = {
Path(f).name.split(".")[0]
for f in all_files
if not Path(f).name.endswith(".q.md")
}
# Filter by sections if provided
if sections:
base_docs = {
doc
for doc in base_docs
if any(section.lower() in doc.lower() for section in sections)
}
# Get file paths based on mode
files = []
for doc in sorted(
base_docs,
key=lambda x: int(x.split("_")[0]) if x.split("_")[0].isdigit() else 999999,
):
if mode == "condensed":
xs_file = self.docs_dir / f"{doc}.xs.md"
regular_file = self.docs_dir / f"{doc}.md"
files.append(str(xs_file if xs_file.exists() else regular_file))
else:
files.append(str(self.docs_dir / f"{doc}.md"))
# Read and format content
content = []
for file in files:
try:
with open(file, "r", encoding="utf-8") as f:
fname = Path(file).name
content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}")
except Exception as e:
self.logger.error(f"Error reading {file}: {str(e)}")
return "\n\n---\n\n".join(content) if content else ""
def search(self, query: str, top_k: int = 5) -> str:
if not self.bm25_index:
return "No search index available. Call build_search_index() first."
query_tokens = self.preprocess_text(query)
doc_scores = self.bm25_index.get_scores(query_tokens)
mean_score = np.mean(doc_scores)
std_score = np.std(doc_scores)
score_threshold = mean_score + (0.25 * std_score)
file_data = self._aggregate_search_scores(
doc_scores=doc_scores,
score_threshold=score_threshold,
query_tokens=query_tokens,
)
ranked_files = sorted(
file_data.items(),
key=lambda x: (
x[1]["code_match_score"] * 2.0
+ x[1]["match_count"] * 1.5
+ x[1]["total_score"]
),
reverse=True,
)[:top_k]
results = []
for file, _ in ranked_files:
main_doc = str(file).replace(".q.md", ".md")
if os.path.exists(self.docs_dir / main_doc):
with open(self.docs_dir / main_doc, "r", encoding="utf-8") as f:
only_file_name = main_doc.split("/")[-1]
content = ["#" * 20, f"# {only_file_name}", "#" * 20, "", f.read()]
results.append("\n".join(content))
return "\n\n---\n\n".join(results)
def _aggregate_search_scores(
self, doc_scores: List[float], score_threshold: float, query_tokens: List[str]
) -> Dict:
file_data = {}
for idx, score in enumerate(doc_scores):
if score <= score_threshold:
continue
fact = self.tokenized_facts[idx]
file_path = self.document_map[fact]
if file_path not in file_data:
file_data[file_path] = {
"total_score": 0,
"match_count": 0,
"code_match_score": 0,
"matched_facts": [],
}
components = fact.split("|") if "|" in fact else [fact]
code_match_score = 0
if len(components) == 3:
code_ref = components[2].strip()
code_tokens = self.preprocess_text(code_ref)
code_match_score = len(set(query_tokens) & set(code_tokens)) / len(
query_tokens
)
file_data[file_path]["total_score"] += score
file_data[file_path]["match_count"] += 1
file_data[file_path]["code_match_score"] = max(
file_data[file_path]["code_match_score"], code_match_score
)
file_data[file_path]["matched_facts"].append(fact)
return file_data
def refresh_index(self) -> None:
"""Convenience method for a full rebuild."""
self.build_search_index(clear_cache=True)

View File

@@ -0,0 +1,29 @@
# version_manager.py
from pathlib import Path
from packaging import version
from . import __version__
class VersionManager:
def __init__(self):
self.home_dir = Path.home() / ".crawl4ai"
self.version_file = self.home_dir / "version.txt"
def get_installed_version(self):
"""Get the version recorded in home directory"""
if not self.version_file.exists():
return None
try:
return version.parse(self.version_file.read_text().strip())
except:
return None
def update_version(self):
"""Update the version file to current library version"""
self.version_file.write_text(__version__.__version__)
def needs_update(self):
"""Check if database needs update based on version"""
installed = self.get_installed_version()
current = version.parse(__version__.__version__)
return installed is None or installed < current

View File

@@ -0,0 +1,294 @@
import os, time
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from pathlib import Path
from .models import UrlModel, CrawlResult
from .database import init_db, get_cached_url, cache_url
from .utils import *
from .chunking_strategy import *
from .extraction_strategy import *
from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor
from .content_scraping_strategy import WebScrapingStrategy
from .config import *
import warnings
import json
warnings.filterwarnings(
"ignore",
message='Field "model_name" has conflict with protected namespace "model_".',
)
class WebCrawler:
def __init__(
self,
crawler_strategy: CrawlerStrategy = None,
always_by_pass_cache: bool = False,
verbose: bool = False,
):
self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(
verbose=verbose
)
self.always_by_pass_cache = always_by_pass_cache
self.crawl4ai_folder = os.path.join(
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
)
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
init_db()
self.ready = False
def warmup(self):
print("[LOG] 🌤️ Warming up the WebCrawler")
self.run(
url="https://google.com/",
word_count_threshold=5,
extraction_strategy=NoExtractionStrategy(),
bypass_cache=False,
verbose=False,
)
self.ready = True
print("[LOG] 🌞 WebCrawler is ready to crawl")
def fetch_page(
self,
url_model: UrlModel,
provider: str = DEFAULT_PROVIDER,
api_token: str = None,
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
css_selector: str = None,
screenshot: bool = False,
use_cached_html: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
**kwargs,
) -> CrawlResult:
return self.run(
url_model.url,
word_count_threshold,
extraction_strategy or NoExtractionStrategy(),
chunking_strategy,
bypass_cache=url_model.forced,
css_selector=css_selector,
screenshot=screenshot,
**kwargs,
)
pass
def fetch_pages(
self,
url_models: List[UrlModel],
provider: str = DEFAULT_PROVIDER,
api_token: str = None,
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
use_cached_html: bool = False,
css_selector: str = None,
screenshot: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
**kwargs,
) -> List[CrawlResult]:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
def fetch_page_wrapper(url_model, *args, **kwargs):
return self.fetch_page(url_model, *args, **kwargs)
with ThreadPoolExecutor() as executor:
results = list(
executor.map(
fetch_page_wrapper,
url_models,
[provider] * len(url_models),
[api_token] * len(url_models),
[extract_blocks_flag] * len(url_models),
[word_count_threshold] * len(url_models),
[css_selector] * len(url_models),
[screenshot] * len(url_models),
[use_cached_html] * len(url_models),
[extraction_strategy] * len(url_models),
[chunking_strategy] * len(url_models),
*[kwargs] * len(url_models),
)
)
return results
def run(
self,
url: str,
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
) -> CrawlResult:
try:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
if not isinstance(extraction_strategy, ExtractionStrategy):
raise ValueError("Unsupported extraction strategy")
if not isinstance(chunking_strategy, ChunkingStrategy):
raise ValueError("Unsupported chunking strategy")
word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
cached = None
screenshot_data = None
extracted_content = None
if not bypass_cache and not self.always_by_pass_cache:
cached = get_cached_url(url)
if kwargs.get("warmup", True) and not self.ready:
return None
if cached:
html = sanitize_input_encode(cached[1])
extracted_content = sanitize_input_encode(cached[4])
if screenshot:
screenshot_data = cached[9]
if not screenshot_data:
cached = None
if not cached or not html:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
t1 = time.time()
html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
t2 = time.time()
if verbose:
print(
f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
)
if screenshot:
screenshot_data = self.crawler_strategy.take_screenshot()
crawl_result = self.process_html(
url,
html,
extracted_content,
word_count_threshold,
extraction_strategy,
chunking_strategy,
css_selector,
screenshot_data,
verbose,
bool(cached),
**kwargs,
)
crawl_result.success = bool(html)
return crawl_result
except Exception as e:
if not hasattr(e, "msg"):
e.msg = str(e)
print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
return CrawlResult(url=url, html="", success=False, error_message=e.msg)
def process_html(
self,
url: str,
html: str,
extracted_content: str,
word_count_threshold: int,
extraction_strategy: ExtractionStrategy,
chunking_strategy: ChunkingStrategy,
css_selector: str,
screenshot: bool,
verbose: bool,
is_cached: bool,
**kwargs,
) -> CrawlResult:
t = time.time()
# Extract content from HTML
try:
t1 = time.time()
scrapping_strategy = WebScrapingStrategy()
extra_params = {
k: v
for k, v in kwargs.items()
if k not in ["only_text", "image_description_min_word_threshold"]
}
result = scrapping_strategy.scrap(
url,
html,
word_count_threshold=word_count_threshold,
css_selector=css_selector,
only_text=kwargs.get("only_text", False),
image_description_min_word_threshold=kwargs.get(
"image_description_min_word_threshold",
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
),
**extra_params,
)
# result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
if verbose:
print(
f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
)
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
raise ValueError(str(e))
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})
if extracted_content is None:
if verbose:
print(
f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}"
)
sections = chunking_strategy.chunk(markdown)
extracted_content = extraction_strategy.run(url, sections)
extracted_content = json.dumps(
extracted_content, indent=4, default=str, ensure_ascii=False
)
if verbose:
print(
f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds."
)
screenshot = None if not screenshot else screenshot
if not is_cached:
cache_url(
url,
html,
cleaned_html,
markdown,
extracted_content,
True,
json.dumps(media),
json.dumps(links),
json.dumps(metadata),
screenshot=screenshot,
)
return CrawlResult(
url=url,
html=html,
cleaned_html=format_html(cleaned_html),
markdown=markdown,
media=media,
links=links,
metadata=metadata,
screenshot=screenshot,
extracted_content=extracted_content,
success=True,
error_message="",
)