From d5ed4512990a4baae31b63f11c345b3bbd65753c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 25 Dec 2024 21:34:31 +0800 Subject: [PATCH] Enhance crawler capabilities and documentation - Add llm.txt generator - Added SSL certificate extraction in AsyncWebCrawler. - Introduced new content filters and chunking strategies for more robust data extraction. - Updated documentation. --- .codeiumignore | 220 ++++++ .gitignore | 1 + crawl4ai/async_configs.py | 137 ++-- crawl4ai/async_crawler_strategy.py | 24 +- crawl4ai/async_tools.py | 183 ----- crawl4ai/async_webcrawler.py | 129 +++- crawl4ai/chunking_strategy.py | 8 +- crawl4ai/cli.py | 98 +-- crawl4ai/content_scraping_strategy.py | 106 ++- crawl4ai/docs_manager.py | 92 +-- crawl4ai/extraction_strategy.py | 65 +- crawl4ai/llmtxt.py | 640 +++++++++++++----- crawl4ai/models.py | 19 +- crawl4ai/utilities/cert_exporter.py | 156 +++++ crawl4ai/utilities/ssl_utils.py | 83 +++ crawl4ai/utils.py | 61 +- docs/examples/browser_optimization_example.py | 128 ++++ .../examples/extraction_strategies_example.py | 115 ++++ .../full_page_screenshot_and_pdf_export.md | 4 +- docs/llm.txt/10_file_download.q.md | 73 +- docs/llm.txt/11_page_interaction.q.md | 74 +- docs/llm.txt/12_prefix_based_input.q.md | 66 +- docs/llm.txt/13_hooks_auth.q.md | 70 +- docs/llm.txt/14_proxy_security.q.md | 61 +- docs/llm.txt/15_screenshot_and_pdf_export.md | 4 +- .../llm.txt/15_screenshot_and_pdf_export.q.md | 59 +- docs/llm.txt/16_storage_state.q.md | 10 + docs/llm.txt/16_storage_state_q.md | 52 -- docs/llm.txt/17_crawl_config.q.md | 17 + ...1_introduction.ex.md => 1_introduction.md} | 4 +- docs/llm.txt/1_introduction.q.md | 16 + docs/llm.txt/1_introduction.xs.q.md | 13 + docs/llm.txt/23_common_issues.md | 0 docs/llm.txt/2_configuration.q.md | 117 +--- ...webcrawler.ex.md => 3_async_webcrawler.md} | 0 docs/llm.txt/3_async_webcrawler.q.md | 96 +-- docs/llm.txt/3_async_webcrawler.xs.q.md | 12 + ...t_page.ex.md => 4_browser_context_page.md} | 0 
docs/llm.txt/4_browser_context_page.q.md | 72 +- ...age.sm.md => 4_browser_context_page.xs.md} | 0 ...eration.ex.md => 5_markdown_generation.md} | 24 + docs/llm.txt/5_markdown_generation.q.md | 68 +- ...tion.sm.md => 5_markdown_generation.xs.md} | 0 docs/llm.txt/6_chunking_strategies.q.md | 63 +- ...egies.ex.md => 7_extraction_strategies.md} | 27 + docs/llm.txt/7_extraction_strategies.q.md | 86 +-- ...es.sm.md => 7_extraction_strategies.xs.md} | 25 +- ...selection.ex.md => 8_content_selection.md} | 0 docs/llm.txt/8_content_selection.q.md | 87 +-- ...ection.sm.md => 8_content_selection.xs.md} | 0 docs/llm.txt/9_cache_modes.q.md | 68 +- docs/llm.txt/llmtxt.py | 187 ----- docs/md_v2/extraction/overview.md | 29 + examples/save_certificate.py | 49 ++ examples/ssl_certificate_example.py | 67 ++ main.py | 11 +- requirements.txt | 3 +- tests/test_cli_docs.py | 43 ++ tests/test_llmtxt.py | 49 ++ 59 files changed, 2208 insertions(+), 1763 deletions(-) create mode 100644 .codeiumignore delete mode 100644 crawl4ai/async_tools.py create mode 100644 crawl4ai/utilities/cert_exporter.py create mode 100644 crawl4ai/utilities/ssl_utils.py create mode 100644 docs/examples/browser_optimization_example.py create mode 100644 docs/examples/extraction_strategies_example.py create mode 100644 docs/llm.txt/16_storage_state.q.md delete mode 100644 docs/llm.txt/16_storage_state_q.md create mode 100644 docs/llm.txt/17_crawl_config.q.md rename docs/llm.txt/{1_introduction.ex.md => 1_introduction.md} (99%) create mode 100644 docs/llm.txt/1_introduction.q.md create mode 100644 docs/llm.txt/1_introduction.xs.q.md delete mode 100644 docs/llm.txt/23_common_issues.md rename docs/llm.txt/{3_async_webcrawler.ex.md => 3_async_webcrawler.md} (100%) create mode 100644 docs/llm.txt/3_async_webcrawler.xs.q.md rename docs/llm.txt/{4_browser_context_page.ex.md => 4_browser_context_page.md} (100%) rename docs/llm.txt/{4_browser_context_page.sm.md => 4_browser_context_page.xs.md} (100%) rename 
docs/llm.txt/{5_markdown_generation.ex.md => 5_markdown_generation.md} (96%) rename docs/llm.txt/{5_markdown_generation.sm.md => 5_markdown_generation.xs.md} (100%) rename docs/llm.txt/{7_extraction_strategies.ex.md => 7_extraction_strategies.md} (95%) rename docs/llm.txt/{7_extraction_strategies.sm.md => 7_extraction_strategies.xs.md} (77%) rename docs/llm.txt/{8_content_selection.ex.md => 8_content_selection.md} (100%) rename docs/llm.txt/{8_content_selection.sm.md => 8_content_selection.xs.md} (100%) delete mode 100644 docs/llm.txt/llmtxt.py create mode 100644 examples/save_certificate.py create mode 100644 examples/ssl_certificate_example.py create mode 100644 tests/test_cli_docs.py create mode 100644 tests/test_llmtxt.py diff --git a/.codeiumignore b/.codeiumignore new file mode 100644 index 00000000..76ff6caa --- /dev/null +++ b/.codeiumignore @@ -0,0 +1,220 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +Crawl4AI.egg-info/ +Crawl4AI.egg-info/* +crawler_data.db +.vscode/ +.tests/ +.test_pads/ +test_pad.py +test_pad*.py +.data/ +Crawl4AI.egg-info/ + +requirements0.txt +a.txt + +*.sh +.idea +docs/examples/.chainlit/ +docs/examples/.chainlit/* +.chainlit/config.toml +.chainlit/translations/en-US.json + +local/ +.files/ + +a.txt +.lambda_function.py +ec2* + +update_changelog.sh + +.DS_Store +docs/.DS_Store +tmp/ +test_env/ +**/.DS_Store +**/.DS_Store + +todo.md +todo_executor.md +git_changes.py +git_changes.md +pypi_build.sh +git_issues.py +git_issues.md + +.next/ +.tests/ +.docs/ +.gitboss/ +todo_executor.md +protect-all-except-feature.sh +manage-collab.sh +publish.sh +combine.sh +combined_output.txt +tree.md + diff --git a/.gitignore b/.gitignore index d485815c..012e78cb 100644 --- a/.gitignore +++ b/.gitignore @@ -206,6 +206,7 @@ pypi_build.sh git_issues.py git_issues.md +.local/ .next/ .tests/ .issues/ diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 90aff709..39b6e690 100644 --- a/crawl4ai/async_configs.py +++ 
b/crawl4ai/async_configs.py @@ -207,6 +207,8 @@ class CrawlerRunConfig: Default: None. excluded_tags (list of str or None): List of HTML tags to exclude from processing. Default: None. + excluded_selector (str or None): CSS selector to exclude from processing. + Default: None. keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes. Default: False. remove_forms (bool): If True, remove all `
` elements from the HTML. @@ -316,10 +318,14 @@ class CrawlerRunConfig: only_text: bool = False, css_selector: str = None, excluded_tags: list = None, + excluded_selector: str = None, keep_data_attributes: bool = False, remove_forms: bool = False, prettiify: bool = False, + # SSL Parameters + fetch_ssl_certificate: bool = False, + # Caching Parameters cache_mode=None, session_id: str = None, @@ -383,10 +389,14 @@ class CrawlerRunConfig: self.only_text = only_text self.css_selector = css_selector self.excluded_tags = excluded_tags or [] + self.excluded_selector = excluded_selector or "" self.keep_data_attributes = keep_data_attributes self.remove_forms = remove_forms self.prettiify = prettiify + # SSL Parameters + self.fetch_ssl_certificate = fetch_ssl_certificate + # Caching Parameters self.cache_mode = cache_mode self.session_id = session_id @@ -464,10 +474,14 @@ class CrawlerRunConfig: only_text=kwargs.get("only_text", False), css_selector=kwargs.get("css_selector"), excluded_tags=kwargs.get("excluded_tags", []), + excluded_selector=kwargs.get("excluded_selector", ""), keep_data_attributes=kwargs.get("keep_data_attributes", False), remove_forms=kwargs.get("remove_forms", False), prettiify=kwargs.get("prettiify", False), + # SSL Parameters + fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), + # Caching Parameters cache_mode=kwargs.get("cache_mode"), session_id=kwargs.get("session_id"), @@ -521,70 +535,59 @@ class CrawlerRunConfig: url=kwargs.get("url"), ) - - - - # @staticmethod - # def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": - # return CrawlerRunConfig( - # word_count_threshold=kwargs.get("word_count_threshold", 200), - # extraction_strategy=kwargs.get("extraction_strategy"), - # chunking_strategy=kwargs.get("chunking_strategy"), - # markdown_generator=kwargs.get("markdown_generator"), - # content_filter=kwargs.get("content_filter"), - # cache_mode=kwargs.get("cache_mode"), - # session_id=kwargs.get("session_id"), - # 
bypass_cache=kwargs.get("bypass_cache", False), - # disable_cache=kwargs.get("disable_cache", False), - # no_cache_read=kwargs.get("no_cache_read", False), - # no_cache_write=kwargs.get("no_cache_write", False), - # css_selector=kwargs.get("css_selector"), - # screenshot=kwargs.get("screenshot", False), - # pdf=kwargs.get("pdf", False), - # verbose=kwargs.get("verbose", True), - # only_text=kwargs.get("only_text", False), - # image_description_min_word_threshold=kwargs.get( - # "image_description_min_word_threshold", - # IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, - # ), - # prettiify=kwargs.get("prettiify", False), - # js_code=kwargs.get( - # "js_code" - # ), # If not provided here, will default inside constructor - # wait_for=kwargs.get("wait_for"), - # js_only=kwargs.get("js_only", False), - # wait_until=kwargs.get("wait_until", "domcontentloaded"), - # page_timeout=kwargs.get("page_timeout", 60000), - # ignore_body_visibility=kwargs.get("ignore_body_visibility", True), - # adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False), - # scan_full_page=kwargs.get("scan_full_page", False), - # scroll_delay=kwargs.get("scroll_delay", 0.2), - # process_iframes=kwargs.get("process_iframes", False), - # remove_overlay_elements=kwargs.get("remove_overlay_elements", False), - # delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), - # log_console=kwargs.get("log_console", False), - # simulate_user=kwargs.get("simulate_user", False), - # override_navigator=kwargs.get("override_navigator", False), - # magic=kwargs.get("magic", False), - # screenshot_wait_for=kwargs.get("screenshot_wait_for"), - # screenshot_height_threshold=kwargs.get( - # "screenshot_height_threshold", 20000 - # ), - # mean_delay=kwargs.get("mean_delay", 0.1), - # max_range=kwargs.get("max_range", 0.3), - # semaphore_count=kwargs.get("semaphore_count", 5), - # image_score_threshold=kwargs.get( - # "image_score_threshold", IMAGE_SCORE_THRESHOLD - # ), - # 
exclude_social_media_domains=kwargs.get( - # "exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS - # ), - # exclude_external_links=kwargs.get("exclude_external_links", False), - # exclude_social_media_links=kwargs.get("exclude_social_media_links", False), - # exclude_domains=kwargs.get("exclude_domains", []), - # exclude_external_images=kwargs.get("exclude_external_images", False), - # remove_forms=kwargs.get("remove_forms", False), - # keep_data_attributes=kwargs.get("keep_data_attributes", False), - # excluded_tags=kwargs.get("excluded_tags", []), - # ) - + # Create a funciton returns dict of the object + def to_dict(self): + return { + "word_count_threshold": self.word_count_threshold, + "extraction_strategy": self.extraction_strategy, + "chunking_strategy": self.chunking_strategy, + "markdown_generator": self.markdown_generator, + "content_filter": self.content_filter, + "only_text": self.only_text, + "css_selector": self.css_selector, + "excluded_tags": self.excluded_tags, + "excluded_selector": self.excluded_selector, + "keep_data_attributes": self.keep_data_attributes, + "remove_forms": self.remove_forms, + "prettiify": self.prettiify, + "fetch_ssl_certificate": self.fetch_ssl_certificate, + "cache_mode": self.cache_mode, + "session_id": self.session_id, + "bypass_cache": self.bypass_cache, + "disable_cache": self.disable_cache, + "no_cache_read": self.no_cache_read, + "no_cache_write": self.no_cache_write, + "wait_until": self.wait_until, + "page_timeout": self.page_timeout, + "wait_for": self.wait_for, + "wait_for_images": self.wait_for_images, + "delay_before_return_html": self.delay_before_return_html, + "mean_delay": self.mean_delay, + "max_range": self.max_range, + "semaphore_count": self.semaphore_count, + "js_code": self.js_code, + "js_only": self.js_only, + "ignore_body_visibility": self.ignore_body_visibility, + "scan_full_page": self.scan_full_page, + "scroll_delay": self.scroll_delay, + "process_iframes": self.process_iframes, + 
"remove_overlay_elements": self.remove_overlay_elements, + "simulate_user": self.simulate_user, + "override_navigator": self.override_navigator, + "magic": self.magic, + "adjust_viewport_to_content": self.adjust_viewport_to_content, + "screenshot": self.screenshot, + "screenshot_wait_for": self.screenshot_wait_for, + "screenshot_height_threshold": self.screenshot_height_threshold, + "pdf": self.pdf, + "image_description_min_word_threshold": self.image_description_min_word_threshold, + "image_score_threshold": self.image_score_threshold, + "exclude_external_images": self.exclude_external_images, + "exclude_social_media_domains": self.exclude_social_media_domains, + "exclude_external_links": self.exclude_external_links, + "exclude_social_media_links": self.exclude_social_media_links, + "exclude_domains": self.exclude_domains, + "verbose": self.verbose, + "log_console": self.log_console, + "url": self.url, + } diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index c89a8513..b7ac54c4 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -23,11 +23,7 @@ from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT from .async_configs import BrowserConfig, CrawlerRunConfig from .async_logger import AsyncLogger from playwright_stealth import StealthConfig, stealth_async - - -from io import BytesIO -import base64 -from PIL import Image, ImageDraw, ImageFont +from .utilities.ssl_utils import get_ssl_certificate stealth_config = StealthConfig( webdriver=True, @@ -566,18 +562,6 @@ class AsyncCrawlerStrategy(ABC): async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: pass - @abstractmethod - async def take_screenshot(self, **kwargs) -> str: - pass - - @abstractmethod - def update_user_agent(self, user_agent: str): - pass - - @abstractmethod - def set_hook(self, hook_type: str, hook: Callable): - pass - class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): def 
__init__( @@ -928,6 +912,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page.on("pageerror", lambda e: log_consol(e, "error")) try: + # Get SSL certificate information if requested and URL is HTTPS + ssl_certificate = None + if config.fetch_ssl_certificate and url.startswith('https://'): + ssl_certificate = get_ssl_certificate(url) + # Set up download handling if self.browser_config.accept_downloads: page.on( @@ -1155,6 +1144,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): screenshot=screenshot_data, pdf_data=pdf_data, get_delayed_content=get_delayed_content, + ssl_certificate=ssl_certificate, downloaded_files=( self._downloaded_files if self._downloaded_files else None ), diff --git a/crawl4ai/async_tools.py b/crawl4ai/async_tools.py deleted file mode 100644 index 157e5596..00000000 --- a/crawl4ai/async_tools.py +++ /dev/null @@ -1,183 +0,0 @@ -import asyncio -import base64 -import time -from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional, Awaitable -import os, sys, shutil -import tempfile, subprocess -from playwright.async_api import async_playwright, Page, Browser, Error -from playwright.async_api import TimeoutError as PlaywrightTimeoutError -from io import BytesIO -from PIL import Image, ImageDraw, ImageFont -from pathlib import Path -from playwright.async_api import ProxySettings -from pydantic import BaseModel -import hashlib -import json -import uuid -from .models import AsyncCrawlResponse -from .utils import create_box_message -from .user_agent_generator import UserAgentGenerator -from playwright_stealth import StealthConfig, stealth_async - - -class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): - self.browser_type = browser_type - self.user_data_dir = user_data_dir - self.headless = headless - self.browser_process = None - 
self.temp_dir = None - self.debugging_port = debugging_port - self.host = host - self.logger = logger - self.shutting_down = False - - async def start(self) -> str: - """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. - """ - - # Create temp dir if needed - if not self.user_data_dir: - self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") - self.user_data_dir = self.temp_dir - - # Get browser path and args based on OS and browser type - browser_path = self._get_browser_path() - args = self._get_browser_args() - - # Start browser process - try: - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - # Monitor browser process output for errors - asyncio.create_task(self._monitor_browser_process()) - await asyncio.sleep(2) # Give browser time to start - return f"http://{self.host}:{self.debugging_port}" - except Exception as e: - await self.cleanup() - raise Exception(f"Failed to start browser: {e}") - - async def _monitor_browser_process(self): - """Monitor the browser process for unexpected termination.""" - if self.browser_process: - try: - stdout, stderr = await asyncio.gather( - asyncio.to_thread(self.browser_process.stdout.read), - asyncio.to_thread(self.browser_process.stderr.read) - ) - - # Check shutting_down flag BEFORE logging anything - if self.browser_process.poll() is not None: - if not self.shutting_down: - self.logger.error( - message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", - tag="ERROR", - params={ - "code": self.browser_process.returncode, - "stdout": stdout.decode(), - "stderr": stderr.decode() - } - ) - await self.cleanup() - else: - self.logger.info( - message="Browser process terminated normally | Code: {code}", - tag="INFO", - params={"code": self.browser_process.returncode} - ) - except Exception as e: - if not self.shutting_down: - self.logger.error( 
- message="Error monitoring browser process: {error}", - tag="ERROR", - params={"error": str(e)} - ) - - def _get_browser_path(self) -> str: - """Returns the browser executable path based on OS and browser type""" - if sys.platform == "darwin": # macOS - paths = { - "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", - "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" - } - elif sys.platform == "win32": # Windows - paths = { - "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", - "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", - "webkit": None # WebKit not supported on Windows - } - else: # Linux - paths = { - "chromium": "google-chrome", - "firefox": "firefox", - "webkit": None # WebKit not supported on Linux - } - - return paths.get(self.browser_type) - - def _get_browser_args(self) -> List[str]: - """Returns browser-specific command line arguments""" - base_args = [self._get_browser_path()] - - if self.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.debugging_port}", - f"--user-data-dir={self.user_data_dir}", - ] - if self.headless: - args.append("--headless=new") - elif self.browser_type == "firefox": - args = [ - "--remote-debugging-port", str(self.debugging_port), - "--profile", self.user_data_dir, - ] - if self.headless: - args.append("--headless") - else: - raise NotImplementedError(f"Browser type {self.browser_type} not supported") - - return base_args + args - - async def cleanup(self): - """Cleanup browser process and temporary directory""" - # Set shutting_down flag BEFORE any termination actions - self.shutting_down = True - - if self.browser_process: - try: - self.browser_process.terminate() - # Wait for process to end gracefully - for _ in range(10): # 10 attempts, 100ms each - if self.browser_process.poll() is not None: - break - await asyncio.sleep(0.1) - - # Force kill if still running - if 
self.browser_process.poll() is None: - self.browser_process.kill() - await asyncio.sleep(0.1) # Brief wait for kill to take effect - - except Exception as e: - self.logger.error( - message="Error terminating browser: {error}", - tag="ERROR", - params={"error": str(e)} - ) - - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - except Exception as e: - self.logger.error( - message="Error removing temporary directory: {error}", - tag="ERROR", - params={"error": str(e)} - ) - diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 2036f56f..8cab693b 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -42,6 +42,26 @@ class AsyncWebCrawler: """ Asynchronous web crawler with flexible caching capabilities. + There are two ways to use the crawler: + + 1. Using context manager (recommended for simple cases): + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") + ``` + + 2. Using explicit lifecycle management (recommended for long-running applications): + ```python + crawler = AsyncWebCrawler() + await crawler.start() + + # Use the crawler multiple times + result1 = await crawler.arun(url="https://example.com") + result2 = await crawler.arun(url="https://another.com") + + await crawler.close() + ``` + Migration Guide: Old way (deprecated): crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True) @@ -127,16 +147,49 @@ class AsyncWebCrawler: self.ready = False - async def __aenter__(self): + async def start(self): + """ + Start the crawler explicitly without using context manager. + This is equivalent to using 'async with' but gives more control over the lifecycle. + + This method will: + 1. Initialize the browser and context + 2. Perform warmup sequence + 3. 
Return the crawler instance for method chaining + + Returns: + AsyncWebCrawler: The initialized crawler instance + """ await self.crawler_strategy.__aenter__() await self.awarmup() return self + async def close(self): + """ + Close the crawler explicitly without using context manager. + This should be called when you're done with the crawler if you used start(). + + This method will: + 1. Clean up browser resources + 2. Close any open pages and contexts + """ + await self.crawler_strategy.__aexit__(None, None, None) + + async def __aenter__(self): + return await self.start() + async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) + await self.close() async def awarmup(self): - """Initialize the crawler with warm-up sequence.""" + """ + Initialize the crawler with warm-up sequence. + + This method: + 1. Logs initialization info + 2. Sets up browser configuration + 3. Marks the crawler as ready + """ self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") self.ready = True @@ -144,7 +197,7 @@ class AsyncWebCrawler: async def nullcontext(self): """异步空上下文管理器""" yield - + async def arun( self, url: str, @@ -204,14 +257,14 @@ class AsyncWebCrawler: try: # Handle configuration if crawler_config is not None: - if any(param is not None for param in [ - word_count_threshold, extraction_strategy, chunking_strategy, - content_filter, cache_mode, css_selector, screenshot, pdf - ]): - self.logger.warning( - message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", - tag="WARNING" - ) + # if any(param is not None for param in [ + # word_count_threshold, extraction_strategy, chunking_strategy, + # content_filter, cache_mode, css_selector, screenshot, pdf + # ]): + # self.logger.warning( + # message="Both crawler_config and legacy parameters provided. 
crawler_config will take precedence.", + # tag="WARNING" + # ) config = crawler_config else: # Merge all parameters into a single kwargs dict for config creation @@ -322,6 +375,7 @@ class AsyncWebCrawler: screenshot=screenshot_data, pdf_data=pdf_data, verbose=config.verbose, + is_raw_html = True if url.startswith("raw:") else False, **kwargs ) @@ -330,9 +384,11 @@ class AsyncWebCrawler: crawl_result.status_code = async_response.status_code crawl_result.response_headers = async_response.response_headers crawl_result.downloaded_files = async_response.downloaded_files + crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate else: crawl_result.status_code = 200 crawl_result.response_headers = cached_result.response_headers if cached_result else {} + crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache crawl_result.success = bool(html) crawl_result.session_id = getattr(config, 'session_id', None) @@ -416,15 +472,20 @@ class AsyncWebCrawler: scrapping_strategy = WebScrapingStrategy(logger=self.logger) # Process HTML content + params = {k:v for k, v in config.to_dict().items() if k not in ["url"]} + # add keys from kwargs to params that doesn't exist in params + params.update({k:v for k, v in kwargs.items() if k not in params.keys()}) + result = scrapping_strategy.scrap( url, html, - word_count_threshold=config.word_count_threshold, - css_selector=config.css_selector, - only_text=config.only_text, - image_description_min_word_threshold=config.image_description_min_word_threshold, - content_filter=config.content_filter, - **kwargs + **params, + # word_count_threshold=config.word_count_threshold, + # css_selector=config.css_selector, + # only_text=config.only_text, + # image_description_min_word_threshold=config.image_description_min_word_threshold, + # content_filter=config.content_filter, + # **kwargs ) if result is None: @@ -476,15 +537,27 @@ class AsyncWebCrawler: t1 = 
time.perf_counter() - # Handle different extraction strategy types - if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonXPathExtractionStrategy)): - config.extraction_strategy.verbose = verbose - extracted_content = config.extraction_strategy.run(url, [html]) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - else: - sections = config.chunking_strategy.chunk(markdown) - extracted_content = config.extraction_strategy.run(url, sections) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + # Choose content based on input_format + content_format = config.extraction_strategy.input_format + if content_format == "fit_markdown" and not markdown_result.fit_markdown: + self.logger.warning( + message="Fit markdown requested but not available. Falling back to raw markdown.", + tag="EXTRACT", + params={"url": _url} + ) + content_format = "markdown" + + content = { + "markdown": markdown, + "html": html, + "fit_markdown": markdown_result.raw_markdown + }.get(content_format, markdown) + + # Use IdentityChunking for HTML input, otherwise use provided chunking strategy + chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy + sections = chunking.chunk(content) + extracted_content = config.extraction_strategy.run(url, sections) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) # Log extraction completion self.logger.info( @@ -683,5 +756,3 @@ class AsyncWebCrawler: async def aget_cache_size(self): """Get the total number of cached items.""" return await async_db_manager.aget_total_count() - - diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py index af857947..2af56b32 100644 --- a/crawl4ai/chunking_strategy.py +++ b/crawl4ai/chunking_strategy.py @@ -14,7 +14,12 @@ class ChunkingStrategy(ABC): Abstract method to chunk the given text. 
""" pass - + +# Create an identity chunking strategy f(x) = [x] +class IdentityChunking(ChunkingStrategy): + def chunk(self, text: str) -> list: + return [text] + # Regex-based chunking class RegexChunking(ChunkingStrategy): def __init__(self, patterns=None, **kwargs): @@ -127,7 +132,6 @@ class SlidingWindowChunking(ChunkingStrategy): return chunks - class OverlappingWindowChunking(ChunkingStrategy): def __init__(self, window_size=1000, overlap=100, **kwargs): """ diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index cd5ee633..4a01c1c2 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -1,8 +1,7 @@ import click import sys import asyncio -from pathlib import Path -from typing import List, Optional +from typing import List from .docs_manager import DocsManager from .async_logger import AsyncLogger @@ -10,20 +9,19 @@ logger = AsyncLogger(verbose=True) docs_manager = DocsManager(logger) def print_table(headers: List[str], rows: List[List[str]], padding: int = 2): - """Helper function to print formatted tables""" - col_widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)] - border = '+' + '+'.join('-' * (width + 2 * padding) for width in col_widths) + '+' + """Print formatted table with headers and rows""" + widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)] + border = '+' + '+'.join('-' * (w + 2 * padding) for w in widths) + '+' + + def format_row(row): + return '|' + '|'.join(f"{' ' * padding}{str(cell):<{w}}{' ' * padding}" + for cell, w in zip(row, widths)) + '|' - def print_row(row): - return '|' + '|'.join( - f"{str(cell):{' '}<{width}}" for cell, width in zip(row, col_widths) - ) + '|' - click.echo(border) - click.echo(print_row(headers)) + click.echo(format_row(headers)) click.echo(border) for row in rows: - click.echo(print_row(row)) + click.echo(format_row(row)) click.echo(border) @click.group() @@ -33,63 +31,75 @@ def cli(): @cli.group() def docs(): - """Documentation and LLM text operations""" + 
"""Documentation operations""" pass @docs.command() @click.argument('sections', nargs=-1) -@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended', - help='Documentation detail level') +@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended') def combine(sections: tuple, mode: str): - """Combine documentation sections. - - If no sections are specified, combines all available sections. - """ + """Combine documentation sections""" try: asyncio.run(docs_manager.ensure_docs_exist()) - result = docs_manager.concatenate_docs(sections, mode) - click.echo(result) + click.echo(docs_manager.generate(sections, mode)) except Exception as e: logger.error(str(e), tag="ERROR") sys.exit(1) @docs.command() @click.argument('query') -@click.option('--top-k', '-k', default=5, help='Number of top results to return') -def search(query: str, top_k: int): - """Search through documentation questions""" +@click.option('--top-k', '-k', default=5) +@click.option('--build-index', is_flag=True, help='Build index if missing') +def search(query: str, top_k: int, build_index: bool): + """Search documentation""" try: - results = docs_manager.search_questions(query, top_k) - click.echo(results) - except Exception as e: - click.echo(f"Error: {str(e)}", err=True) - sys.exit(1) - -@docs.command() -def list(): - """List available documentation sections""" - try: - file_map = docs_manager.get_file_map() - rows = [[num, name] for name, num in file_map.items()] - rows.sort(key=lambda x: int(x[0])) - print_table(['Number', 'Section Name'], rows) + result = docs_manager.search(query, top_k) + if result == "No search index available. Call build_search_index() first.": + if build_index or click.confirm('No search index found. 
Build it now?'): + asyncio.run(docs_manager.llm_text.generate_index_files()) + result = docs_manager.search(query, top_k) + click.echo(result) except Exception as e: click.echo(f"Error: {str(e)}", err=True) sys.exit(1) @docs.command() def update(): - """Update local documentation cache from GitHub""" + """Update docs from GitHub""" try: - docs_manager = DocsManager() - docs_manager.update_docs() + asyncio.run(docs_manager.fetch_docs()) click.echo("Documentation updated successfully") + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +@docs.command() +@click.option('--force-facts', is_flag=True, help='Force regenerate fact files') +@click.option('--clear-cache', is_flag=True, help='Clear BM25 cache') +def index(force_facts: bool, clear_cache: bool): + """Build or rebuild search indexes""" + try: + asyncio.run(docs_manager.ensure_docs_exist()) + asyncio.run(docs_manager.llm_text.generate_index_files( + force_generate_facts=force_facts, + clear_bm25_cache=clear_cache + )) + click.echo("Search indexes built successfully") + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +# Add docs list command +@docs.command() +def list(): + """List available documentation sections""" + try: + sections = docs_manager.list() + print_table(["Sections"], [[section] for section in sections]) except Exception as e: click.echo(f"Error: {str(e)}", err=True) sys.exit(1) - - if __name__ == '__main__': - cli() + cli() \ No newline at end of file diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 4b0c50a5..35fdba1f 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1,4 +1,5 @@ import re # Point 1: Pre-Compile Regular Expressions +import time from abc import ABC, abstractmethod from typing import Dict, Any, Optional from bs4 import BeautifulSoup @@ -16,7 +17,8 @@ from .models import MarkdownGenerationResult from .utils import ( 
extract_metadata, normalize_url, - is_external_url + is_external_url, + get_base_domain, ) @@ -341,6 +343,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): # if element.name == 'img': # process_image(element, url, 0, 1) # return True + base_domain = kwargs.get("base_domain", get_base_domain(url)) if element.name in ['script', 'style', 'link', 'meta', 'noscript']: element.decompose() @@ -348,8 +351,10 @@ class WebScrapingStrategy(ContentScrapingStrategy): keep_element = False - exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', []) - exclude_social_media_domains = list(set(exclude_social_media_domains)) + exclude_domains = kwargs.get('exclude_domains', []) + # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS)) + # exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', []) + # exclude_social_media_domains = list(set(exclude_social_media_domains)) try: if element.name == 'a' and element.get('href'): @@ -369,33 +374,43 @@ class WebScrapingStrategy(ContentScrapingStrategy): link_data = { 'href': normalized_href, 'text': element.get_text().strip(), - 'title': element.get('title', '').strip() + 'title': element.get('title', '').strip(), + 'base_domain': base_domain } + + is_external = is_external_url(normalized_href, base_domain) + + keep_element = True - # Check for duplicates and add to appropriate dictionary - is_external = is_external_url(normalized_href, url_base) + # Handle external link exclusions + if is_external: + link_base_domain = get_base_domain(normalized_href) + link_data['base_domain'] = link_base_domain + if kwargs.get('exclude_external_links', False): + element.decompose() + return False + # elif kwargs.get('exclude_social_media_links', False): + # if link_base_domain in exclude_social_media_domains: + # element.decompose() + # return False + # if any(domain in normalized_href.lower() for domain in 
exclude_social_media_domains): + # element.decompose() + # return False + elif exclude_domains: + if link_base_domain in exclude_domains: + element.decompose() + return False + # if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])): + # element.decompose() + # return False + if is_external: if normalized_href not in external_links_dict: external_links_dict[normalized_href] = link_data else: if normalized_href not in internal_links_dict: internal_links_dict[normalized_href] = link_data - - keep_element = True - - # Handle external link exclusions - if is_external: - if kwargs.get('exclude_external_links', False): - element.decompose() - return False - elif kwargs.get('exclude_social_media_links', False): - if any(domain in normalized_href.lower() for domain in exclude_social_media_domains): - element.decompose() - return False - elif kwargs.get('exclude_domains', []): - if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])): - element.decompose() - return False + except Exception as e: raise Exception(f"Error processing links: {str(e)}") @@ -414,26 +429,40 @@ class WebScrapingStrategy(ContentScrapingStrategy): if 'srcset' in element.attrs: src = element.attrs['srcset'].split(',')[0].split(' ')[0] + # If image src is internal, then skip + if not is_external_url(src, base_domain): + return True + + image_src_base_domain = get_base_domain(src) + # Check flag if we should remove external images if kwargs.get('exclude_external_images', False): - src_url_base = src.split('/')[2] - url_base = url.split('/')[2] - if url_base not in src_url_base: - element.decompose() - return False + element.decompose() + return False + # src_url_base = src.split('/')[2] + # url_base = url.split('/')[2] + # if url_base not in src_url_base: + # element.decompose() + # return False - if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False): - src_url_base = src.split('/')[2] 
- url_base = url.split('/')[2] - if any(domain in src for domain in exclude_social_media_domains): - element.decompose() - return False + # if kwargs.get('exclude_social_media_links', False): + # if image_src_base_domain in exclude_social_media_domains: + # element.decompose() + # return False + # src_url_base = src.split('/')[2] + # url_base = url.split('/')[2] + # if any(domain in src for domain in exclude_social_media_domains): + # element.decompose() + # return False # Handle exclude domains - if kwargs.get('exclude_domains', []): - if any(domain in src for domain in kwargs.get('exclude_domains', [])): + if exclude_domains: + if image_src_base_domain in exclude_domains: element.decompose() return False + # if any(domain in src for domain in kwargs.get('exclude_domains', [])): + # element.decompose() + # return False return True # Always keep image elements except Exception as e: @@ -511,6 +540,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): soup = BeautifulSoup(html, 'lxml') body = soup.body + base_domain = get_base_domain(url) try: meta = extract_metadata("", soup) @@ -556,10 +586,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): for el in selected_elements: body.append(el) + kwargs['exclude_social_media_domains'] = set(kwargs.get('exclude_social_media_domains', []) + SOCIAL_MEDIA_DOMAINS) + kwargs['exclude_domains'] = set(kwargs.get('exclude_domains', [])) + if kwargs.get('exclude_social_media_links', False): + kwargs['exclude_domains'] = kwargs['exclude_domains'].union(kwargs['exclude_social_media_domains']) + result_obj = self.process_element( url, body, word_count_threshold = word_count_threshold, + base_domain=base_domain, **kwargs ) diff --git a/crawl4ai/docs_manager.py b/crawl4ai/docs_manager.py index d7b2c415..aacc5812 100644 --- a/crawl4ai/docs_manager.py +++ b/crawl4ai/docs_manager.py @@ -1,59 +1,67 @@ -import os import requests +import shutil from pathlib import Path -from typing import Optional, List -from .async_logger import 
AsyncLogger -from .llmtxt import LLMTextManager +from crawl4ai.async_logger import AsyncLogger +from crawl4ai.llmtxt import AsyncLLMTextManager class DocsManager: - BASE_URL = "https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/llm.txt" - - def __init__(self, logger: Optional[AsyncLogger] = None): + def __init__(self, logger=None): self.docs_dir = Path.home() / ".crawl4ai" / "docs" + self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt" self.docs_dir.mkdir(parents=True, exist_ok=True) self.logger = logger or AsyncLogger(verbose=True) - self.llm_text = LLMTextManager(self.docs_dir, self.logger) - + self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger) + async def ensure_docs_exist(self): - """Ensure docs are downloaded, fetch if not present""" + """Fetch docs if not present""" if not any(self.docs_dir.iterdir()): - self.logger.info("Documentation not found, downloading...", tag="DOCS") - await self.update_docs() - - async def update_docs(self) -> bool: - """Always fetch latest docs""" + await self.fetch_docs() + + async def fetch_docs(self) -> bool: + """Copy from local docs or download from GitHub""" try: - self.logger.info("Fetching documentation files...", tag="DOCS") - - # Get file list - response = requests.get(f"{self.BASE_URL}/files.json") + # Try local first + if self.local_docs.exists() and (any(self.local_docs.glob("*.md")) or any(self.local_docs.glob("*.tokens"))): + # Empty the local docs directory + for file_path in self.docs_dir.glob("*.md"): + file_path.unlink() + # for file_path in self.docs_dir.glob("*.tokens"): + # file_path.unlink() + for file_path in self.local_docs.glob("*.md"): + shutil.copy2(file_path, self.docs_dir / file_path.name) + # for file_path in self.local_docs.glob("*.tokens"): + # shutil.copy2(file_path, self.docs_dir / file_path.name) + return True + + # Fallback to GitHub + response = requests.get( + "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt", + headers={'Accept': 
'application/vnd.github.v3+json'} + ) response.raise_for_status() - files = response.json()["files"] - # Download each file - for file in files: - response = requests.get(f"{self.BASE_URL}/{file}") - response.raise_for_status() - - file_path = self.docs_dir / file - with open(file_path, 'w', encoding='utf-8') as f: - f.write(response.text) - - self.logger.debug(f"Downloaded {file}", tag="DOCS") - - self.logger.success("Documentation updated successfully", tag="DOCS") + for item in response.json(): + if item['type'] == 'file' and item['name'].endswith('.md'): + content = requests.get(item['download_url']).text + with open(self.docs_dir / item['name'], 'w', encoding='utf-8') as f: + f.write(content) return True - + except Exception as e: - self.logger.error(f"Failed to update documentation: {str(e)}", tag="ERROR") + self.logger.error(f"Failed to fetch docs: {str(e)}") raise + + def list(self) -> list[str]: + """List available topics""" + names = [file_path.stem for file_path in self.docs_dir.glob("*.md")] + # Remove [0-9]+_ prefix + names = [name.split("_", 1)[1] if name[0].isdigit() else name for name in names] + # Exclude those end with .xs.md and .q.md + names = [name for name in names if not name.endswith(".xs") and not name.endswith(".q")] + return names - # Delegate LLM text operations to LLMTextManager - def get_file_map(self) -> dict: - return self.llm_text.get_file_map() + def generate(self, sections, mode="extended"): + return self.llm_text.generate(sections, mode) - def concatenate_docs(self, sections: List[str], mode: str) -> str: - return self.llm_text.concatenate_docs(sections, mode) - - def search_questions(self, query: str, top_k: int = 5) -> str: - return self.llm_text.search_questions(query, top_k) + def search(self, query: str, top_k: int = 5): + return self.llm_text.search(query, top_k) \ No newline at end of file diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 8e5f2928..c2ba891e 100644 --- 
a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -6,6 +6,7 @@ import json, time from .prompts import * from .config import * from .utils import * +from .models import * from functools import partial from .model_loader import * import math @@ -13,13 +14,23 @@ import numpy as np import re from bs4 import BeautifulSoup from lxml import html, etree +from dataclasses import dataclass class ExtractionStrategy(ABC): """ Abstract base class for all extraction strategies. """ - def __init__(self, **kwargs): + def __init__(self, input_format: str = "markdown", **kwargs): + """ + Initialize the extraction strategy. + + Args: + input_format: Content format to use for extraction. + Options: "markdown" (default), "html", "fit_markdown" + **kwargs: Additional keyword arguments + """ + self.input_format = input_format self.DEL = "<|DEL|>" self.name = self.__class__.__name__ self.verbose = kwargs.get("verbose", False) @@ -62,6 +73,8 @@ class NoExtractionStrategy(ExtractionStrategy): # Strategies using LLM-based extraction for text data # ####################################################### + + class LLMExtractionStrategy(ExtractionStrategy): def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, @@ -73,7 +86,7 @@ class LLMExtractionStrategy(ExtractionStrategy): :param api_token: The API token for the provider. :param instruction: The instruction to use for the LLM model. """ - super().__init__() + super().__init__(**kwargs) self.provider = provider self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY") self.instruction = instruction @@ -93,6 +106,8 @@ class LLMExtractionStrategy(ExtractionStrategy): self.chunk_token_threshold = 1e9 self.verbose = kwargs.get("verbose", False) + self.usages = [] # Store individual usages + self.total_usage = TokenUsage() # Accumulated usage if not self.api_token: raise ValueError("API token must be provided for LLMExtractionStrategy. 
Update the config.py or set OPENAI_API_KEY environment variable.") @@ -129,6 +144,21 @@ class LLMExtractionStrategy(ExtractionStrategy): base_url=self.api_base or self.base_url, extra_args = self.extra_args ) # , json_response=self.extract_type == "schema") + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {}, + prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {} + ) + self.usages.append(usage) + + # Update totals + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + try: blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] blocks = json.loads(blocks) @@ -238,6 +268,22 @@ class LLMExtractionStrategy(ExtractionStrategy): return extracted_content + + + def show_usage(self) -> None: + """Print a detailed token usage report showing total and per-request usage.""" + print("\n=== Token Usage Summary ===") + print(f"{'Type':<15} {'Count':>12}") + print("-" * 30) + print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}") + print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}") + print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}") + + print("\n=== Usage History ===") + print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}") + print("-" * 48) + for i, usage in enumerate(self.usages, 1): + print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}") ####################################################### @@ -256,7 +302,7 @@ class CosineStrategy(ExtractionStrategy): linkage_method (str): The linkage method for 
hierarchical clustering. top_k (int): Number of top categories to extract. """ - super().__init__() + super().__init__(**kwargs) import numpy as np @@ -537,7 +583,7 @@ class TopicExtractionStrategy(ExtractionStrategy): :param num_keywords: Number of keywords to represent each topic segment. """ import nltk - super().__init__() + super().__init__(**kwargs) self.num_keywords = num_keywords self.tokenizer = nltk.TextTilingTokenizer() @@ -604,6 +650,7 @@ class ContentSummarizationStrategy(ExtractionStrategy): :param model_name: The model to use for summarization. """ + super().__init__(**kwargs) from transformers import pipeline self.summarizer = pipeline("summarization", model=model_name) @@ -809,6 +856,10 @@ class JsonElementExtractionStrategy(ExtractionStrategy): pass class JsonCssExtractionStrategy(JsonElementExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + def _parse_html(self, html_content: str): return BeautifulSoup(html_content, 'html.parser') @@ -829,6 +880,10 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): return element.get(attribute) class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + def _parse_html(self, html_content: str): return html.fromstring(html_content) @@ -869,6 +924,7 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): class _JsonCssExtractionStrategy(ExtractionStrategy): def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input super().__init__(**kwargs) self.schema = schema @@ -983,6 +1039,7 @@ class _JsonCssExtractionStrategy(ExtractionStrategy): return self.extract(url, combined_html, **kwargs) class _JsonXPathExtractionStrategy(ExtractionStrategy): def __init__(self, schema: 
Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input super().__init__(**kwargs) self.schema = schema diff --git a/crawl4ai/llmtxt.py b/crawl4ai/llmtxt.py index 1c27cdc9..94efe076 100644 --- a/crawl4ai/llmtxt.py +++ b/crawl4ai/llmtxt.py @@ -1,196 +1,498 @@ import os from pathlib import Path -from rank_bm25 import BM25Okapi import re -from typing import List, Literal - +from typing import Dict, List, Tuple, Optional, Any +import json +from tqdm import tqdm +import time +import psutil +import numpy as np +from rank_bm25 import BM25Okapi from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer -import nltk +from litellm import completion, batch_completion +from .async_logger import AsyncLogger +import litellm +import pickle +import hashlib # <--- ADDED for file-hash +from fnmatch import fnmatch +import glob +litellm.set_verbose = False -BASE_PATH = Path(__file__).resolve().parent +def _compute_file_hash(file_path: Path) -> str: + """Compute MD5 hash for the file's entire content.""" + hash_md5 = hashlib.md5() + with file_path.open("rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() -class LLMTextManager: - """Manages LLM text operations and caching""" - - def __init__(self, docs_dir: Path, logger: Optional['AsyncLogger'] = None): +class AsyncLLMTextManager: + def __init__( + self, + docs_dir: Path, + logger: Optional[AsyncLogger] = None, + max_concurrent_calls: int = 5, + batch_size: int = 3 + ) -> None: self.docs_dir = docs_dir self.logger = logger - - def get_file_map(self) -> dict: - """Cache file mappings to avoid repeated directory scans""" - files = os.listdir(self.docs_dir) - file_map = {} - - for file in files: - if file.endswith('.md'): - # Extract number and name: "6_chunking_strategies.md" -> ("chunking_strategies", "6") - match = re.match(r'(\d+)_(.+?)(?:\.(?:ex|xs|sm|q)?\.md)?$', file) - if match: - num, name = 
match.groups() - if name not in file_map: - file_map[name] = num - return file_map + self.max_concurrent_calls = max_concurrent_calls + self.batch_size = batch_size + self.bm25_index = None + self.document_map: Dict[str, Any] = {} + self.tokenized_facts: List[str] = [] + self.bm25_index_file = self.docs_dir / "bm25_index.pkl" - def concatenate_docs(self, file_names: List[str], mode: str) -> str: - """Concatenate documentation files based on names and mode.""" - file_map = self.get_file_map() - result = [] - suffix_map = { - "extended": ".ex.md", - "condensed": [".xs.md", ".sm.md"] - } - - for name in file_names: - if name not in file_map: - continue - - num = file_map[name] - base_path = self.docs_dir - - if mode == "extended": - file_path = base_path / f"{num}_{name}{suffix_map[mode]}" - if not file_path.exists(): - file_path = base_path / f"{num}_{name}.md" - else: - file_path = None - for suffix in suffix_map["condensed"]: - temp_path = base_path / f"{num}_{name}{suffix}" - if temp_path.exists(): - file_path = temp_path - break - if not file_path: - file_path = base_path / f"{num}_{name}.md" - - if file_path.exists(): + async def _process_document_batch(self, doc_batch: List[Path]) -> None: + """Process a batch of documents in parallel""" + contents = [] + for file_path in doc_batch: + try: with open(file_path, 'r', encoding='utf-8') as f: - result.append(f.read()) - - return "\n\n---\n\n".join(result) + contents.append(f.read()) + except Exception as e: + self.logger.error(f"Error reading {file_path}: {str(e)}") + contents.append("") # Add empty content to maintain batch alignment - def search_questions(self, query: str, top_k: int = 5) -> str: - """Search through Q files using BM25 ranking and return top K matches.""" - q_files = [f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")] - # Prepare base path for file reading - q_files = [self.docs_dir / f for f in q_files] # Convert to full path - - documents = [] - file_contents = {} - - for file in 
q_files: - with open(file, 'r', encoding='utf-8') as f: - content = f.read() - questions = extract_questions(content) - for category, question, full_section in questions: - documents.append(question) - file_contents[question] = (file, category, full_section) + prompt = """Given a documentation file, generate a list of atomic facts where each fact: +1. Represents a single piece of knowledge +2. Contains variations in terminology for the same concept +3. References relevant code patterns if they exist +4. Is written in a way that would match natural language queries - if not documents: - return "No questions found in documentation." +Each fact should follow this format: +: | | - tokenized_docs = [preprocess_text(doc) for doc in documents] - tokenized_query = preprocess_text(query) +Example Facts: +browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True) +redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0) +pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5] + +Wrap your response in ... tags. +""" + + # Prepare messages for batch processing + messages_list = [ + [ + {"role": "user", "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}"} + ] + for content in contents if content + ] + + try: + responses = batch_completion( + model="anthropic/claude-3-5-sonnet-latest", + messages=messages_list, + logger_fn=None + ) + + # Process responses and save index files + for response, file_path in zip(responses, doc_batch): + try: + index_content_match = re.search( + r'(.*?)', + response.choices[0].message.content, + re.DOTALL + ) + if not index_content_match: + self.logger.warning(f"No ... 
content found for {file_path}") + continue + + index_content = re.sub( + r"\n\s*\n", "\n", index_content_match.group(1) + ).strip() + if index_content: + index_file = file_path.with_suffix('.q.md') + with open(index_file, 'w', encoding='utf-8') as f: + f.write(index_content) + self.logger.info(f"Created index file: {index_file}") + else: + self.logger.warning(f"No index content found in response for {file_path}") + + except Exception as e: + self.logger.error(f"Error processing response for {file_path}: {str(e)}") + + except Exception as e: + self.logger.error(f"Error in batch completion: {str(e)}") + + def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]: + if "|" not in line: + return False, "Missing separator '|'" + + parts = [p.strip() for p in line.split("|")] + if len(parts) != 3: + return False, f"Expected 3 parts, got {len(parts)}" + + concept_part = parts[0] + if ":" not in concept_part: + return False, "Missing ':' in concept definition" + + return True, None + + def _load_or_create_token_cache(self, fact_file: Path) -> Dict: + """ + Load token cache from .q.tokens if present and matching file hash. + Otherwise return a new structure with updated file-hash. 
+ """ + cache_file = fact_file.with_suffix(".q.tokens") + current_hash = _compute_file_hash(fact_file) + + if cache_file.exists(): + try: + with open(cache_file, "r") as f: + cache = json.load(f) + # If the hash matches, return it directly + if cache.get("content_hash") == current_hash: + return cache + # Otherwise, we signal that it's changed + self.logger.info(f"Hash changed for {fact_file}, reindex needed.") + except json.JSONDecodeError: + self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.") + except Exception as e: + self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}") + + # Return a fresh cache + return {"facts": {}, "content_hash": current_hash} + + def _save_token_cache(self, fact_file: Path, cache: Dict) -> None: + cache_file = fact_file.with_suffix(".q.tokens") + # Always ensure we're saving the correct file-hash + cache["content_hash"] = _compute_file_hash(fact_file) + with open(cache_file, "w") as f: + json.dump(cache, f) + + def preprocess_text(self, text: str) -> List[str]: + parts = [x.strip() for x in text.split("|")] if "|" in text else [text] + # Remove : after the first word of parts[0] + parts[0] = re.sub(r"^(.*?):", r"\1", parts[0]) + + lemmatizer = WordNetLemmatizer() + stop_words = set(stopwords.words("english")) - { + "how", "what", "when", "where", "why", "which", + } + + tokens = [] + for part in parts: + if "(" in part and ")" in part: + code_tokens = re.findall( + r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', part + ) + tokens.extend(code_tokens) + + words = word_tokenize(part.lower()) + tokens.extend( + [ + lemmatizer.lemmatize(token) + for token in words + if token not in stop_words + ] + ) + + return tokens + + def maybe_load_bm25_index(self, clear_cache=False) -> bool: + """ + Load existing BM25 index from disk, if present and clear_cache=False. 
+ """ + if not clear_cache and os.path.exists(self.bm25_index_file): + self.logger.info("Loading existing BM25 index from disk.") + with open(self.bm25_index_file, "rb") as f: + data = pickle.load(f) + self.tokenized_facts = data["tokenized_facts"] + self.bm25_index = data["bm25_index"] + return True + return False + + def build_search_index(self, clear_cache=False) -> None: + """ + Checks for new or modified .q.md files by comparing file-hash. + If none need reindexing and clear_cache is False, loads existing index if available. + Otherwise, reindexes only changed/new files and merges or creates a new index. + """ + # If clear_cache is True, we skip partial logic: rebuild everything from scratch + if clear_cache: + self.logger.info("Clearing cache and rebuilding full search index.") + if self.bm25_index_file.exists(): + self.bm25_index_file.unlink() + + process = psutil.Process() + self.logger.info("Checking which .q.md files need (re)indexing...") + + # Gather all .q.md files + q_files = [self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")] + + # We'll store known (unchanged) facts in these lists + existing_facts: List[str] = [] + existing_tokens: List[List[str]] = [] + + # Keep track of invalid lines for logging + invalid_lines = [] + needSet = [] # files that must be (re)indexed + + for qf in q_files: + token_cache_file = qf.with_suffix(".q.tokens") + + # If no .q.tokens or clear_cache is True → definitely reindex + if clear_cache or not token_cache_file.exists(): + needSet.append(qf) + continue + + # Otherwise, load the existing cache and compare hash + cache = self._load_or_create_token_cache(qf) + # If the .q.tokens was out of date (i.e. 
changed hash), we reindex + if len(cache["facts"]) == 0 or cache.get("content_hash") != _compute_file_hash(qf): + needSet.append(qf) + else: + # File is unchanged → retrieve cached token data + for line, cache_data in cache["facts"].items(): + existing_facts.append(line) + existing_tokens.append(cache_data["tokens"]) + self.document_map[line] = qf # track the doc for that fact + + if not needSet and not clear_cache: + # If no file needs reindexing, try loading existing index + if self.maybe_load_bm25_index(clear_cache=False): + self.logger.info("No new/changed .q.md files found. Using existing BM25 index.") + return + else: + # If there's no existing index, we must build a fresh index from the old caches + self.logger.info("No existing BM25 index found. Building from cached facts.") + if existing_facts: + self.logger.info(f"Building BM25 index with {len(existing_facts)} cached facts.") + self.bm25_index = BM25Okapi(existing_tokens) + self.tokenized_facts = existing_facts + with open(self.bm25_index_file, "wb") as f: + pickle.dump({ + "bm25_index": self.bm25_index, + "tokenized_facts": self.tokenized_facts + }, f) + else: + self.logger.warning("No facts found at all. Index remains empty.") + return + + # ----------------------------------------------------- + # If we reach here, we have new or changed .q.md files + # We'll parse them, reindex them, and then combine with existing_facts + # ----------------------------------------------------- + + self.logger.info(f"{len(needSet)} file(s) need reindexing.
Parsing now...") + + # 1) Parse the new or changed .q.md files + new_facts = [] + new_tokens = [] + with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar: + for file in needSet: + # We'll build up a fresh cache + fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)} + try: + with open(file, "r", encoding="utf-8") as f_obj: + content = f_obj.read().strip() + lines = [l.strip() for l in content.split("\n") if l.strip()] + + for line in lines: + is_valid, error = self._validate_fact_line(line) + if not is_valid: + invalid_lines.append((file, line, error)) + continue + + tokens = self.preprocess_text(line) + fresh_cache["facts"][line] = { + "tokens": tokens, + "added": time.time(), + } + new_facts.append(line) + new_tokens.append(tokens) + self.document_map[line] = file + + # Save the new .q.tokens with updated hash + self._save_token_cache(file, fresh_cache) + + mem_usage = process.memory_info().rss / 1024 / 1024 + self.logger.debug(f"Memory usage after {file.name}: {mem_usage:.2f}MB") + + except Exception as e: + self.logger.error(f"Error processing {file}: {str(e)}") + + file_pbar.update(1) + + if invalid_lines: + self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:") + for file, line, error in invalid_lines: + self.logger.warning(f"{file}: {error} in line: {line[:50]}...") + + # 2) Merge newly tokenized facts with the existing ones + all_facts = existing_facts + new_facts + all_tokens = existing_tokens + new_tokens + + # 3) Build BM25 index from combined facts + self.logger.info(f"Building BM25 index with {len(all_facts)} total facts (old + new).") + self.bm25_index = BM25Okapi(all_tokens) + self.tokenized_facts = all_facts + + # 4) Save the updated BM25 index to disk + with open(self.bm25_index_file, "wb") as f: + pickle.dump({ + "bm25_index": self.bm25_index, + "tokenized_facts": self.tokenized_facts + }, f) + + final_mem = process.memory_info().rss / 1024 / 1024 + self.logger.info(f"Search index updated. 
Final memory usage: {final_mem:.2f}MB") + + async def generate_index_files(self, force_generate_facts: bool = False, clear_bm25_cache: bool = False) -> None: + """ + Generate index files for all documents in parallel batches - bm25 = BM25Okapi(tokenized_docs) - doc_scores = bm25.get_scores(tokenized_query) + Args: + force_generate_facts (bool): If True, regenerate indexes even if they exist + clear_bm25_cache (bool): If True, clear existing BM25 index cache + """ + self.logger.info("Starting index generation for documentation files.") - score_threshold = max(doc_scores) * 0.4 + md_files = [ + self.docs_dir / f for f in os.listdir(self.docs_dir) + if f.endswith('.md') and not any(f.endswith(x) for x in ['.q.md', '.xs.md']) + ] + + # Filter out files that already have .q files unless force=True + if not force_generate_facts: + md_files = [ + f for f in md_files + if not (self.docs_dir / f.name.replace('.md', '.q.md')).exists() + ] + + if not md_files: + self.logger.info("All index files exist. 
Use force=True to regenerate.") + else: + # Process documents in batches + for i in range(0, len(md_files), self.batch_size): + batch = md_files[i:i + self.batch_size] + self.logger.info(f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}") + await self._process_document_batch(batch) + + self.logger.info("Index generation complete, building/updating search index.") + self.build_search_index(clear_cache=clear_bm25_cache) + + def generate(self, sections: List[str], mode: str = "extended") -> str: + # Get all markdown files + all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + \ + glob.glob(str(self.docs_dir / "[0-9]*.xs.md")) - # Aggregate scores by file - file_data = {} - for idx, score in enumerate(doc_scores): - if score > score_threshold: - question = documents[idx] - file, category, _ = file_contents[question] - - if file not in file_data: - file_data[file] = { - 'total_score': 0, - 'match_count': 0, - 'questions': [] - } - - file_data[file]['total_score'] += score - file_data[file]['match_count'] += 1 - file_data[file]['questions'].append({ - 'category': category, - 'question': question, - 'score': score - }) + # Extract base names without extensions + base_docs = {Path(f).name.split('.')[0] for f in all_files + if not Path(f).name.endswith('.q.md')} - # Sort files by match count and total score + # Filter by sections if provided + if sections: + base_docs = {doc for doc in base_docs + if any(section.lower() in doc.lower() for section in sections)} + + # Get file paths based on mode + files = [] + for doc in sorted(base_docs, key=lambda x: int(x.split('_')[0]) if x.split('_')[0].isdigit() else 999999): + if mode == "condensed": + xs_file = self.docs_dir / f"{doc}.xs.md" + regular_file = self.docs_dir / f"{doc}.md" + files.append(str(xs_file if xs_file.exists() else regular_file)) + else: + files.append(str(self.docs_dir / f"{doc}.md")) + + # Read and format content + content = [] + for file in files: + try: + with open(file, 
'r', encoding='utf-8') as f: + fname = Path(file).name + content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}") + except Exception as e: + self.logger.error(f"Error reading {file}: {str(e)}") + + return "\n\n---\n\n".join(content) if content else "" + + def search(self, query: str, top_k: int = 5) -> str: + if not self.bm25_index: + return "No search index available. Call build_search_index() first." + + query_tokens = self.preprocess_text(query) + doc_scores = self.bm25_index.get_scores(query_tokens) + + mean_score = np.mean(doc_scores) + std_score = np.std(doc_scores) + score_threshold = mean_score + (0.25 * std_score) + + file_data = self._aggregate_search_scores( + doc_scores=doc_scores, + score_threshold=score_threshold, + query_tokens=query_tokens, + ) + ranked_files = sorted( file_data.items(), - key=lambda x: (x[1]['match_count'], x[1]['total_score']), - reverse=True + key=lambda x: ( + x[1]["code_match_score"] * 2.0 + + x[1]["match_count"] * 1.5 + + x[1]["total_score"] + ), + reverse=True, )[:top_k] - - # Format results by file + results = [] - for file, data in ranked_files: - questions_summary = "\n".join( - f"- [{q['category']}] {q['question']} (score: {q['score']:.2f})" - for q in sorted(data['questions'], key=lambda x: x['score'], reverse=True) + for file, _ in ranked_files: + main_doc = str(file).replace(".q.md", ".md") + if os.path.exists(self.docs_dir / main_doc): + with open(self.docs_dir / main_doc, "r", encoding='utf-8') as f: + only_file_name = main_doc.split("/")[-1] + content = [ + "#" * 20, + f"# {only_file_name}", + "#" * 20, + "", + f.read() + ] + results.append("\n".join(content)) + + return "\n\n---\n\n".join(results) + + def _aggregate_search_scores( + self, doc_scores: List[float], score_threshold: float, query_tokens: List[str] + ) -> Dict: + file_data = {} + + for idx, score in enumerate(doc_scores): + if score <= score_threshold: + continue + + fact = self.tokenized_facts[idx] + file_path = self.document_map[fact] + + if 
file_path not in file_data: + file_data[file_path] = { + "total_score": 0, + "match_count": 0, + "code_match_score": 0, + "matched_facts": [], + } + + components = fact.split("|") if "|" in fact else [fact] + + code_match_score = 0 + if len(components) == 3: + code_ref = components[2].strip() + code_tokens = self.preprocess_text(code_ref) + code_match_score = len(set(query_tokens) & set(code_tokens)) / len(query_tokens) + + file_data[file_path]["total_score"] += score + file_data[file_path]["match_count"] += 1 + file_data[file_path]["code_match_score"] = max( + file_data[file_path]["code_match_score"], code_match_score ) - - results.append( - f"File: {file}\n" - f"Match Count: {data['match_count']}\n" - f"Total Score: {data['total_score']:.2f}\n\n" - f"Matching Questions:\n{questions_summary}" - ) - - return "\n\n---\n\n".join(results) if results else "No relevant matches found." + file_data[file_path]["matched_facts"].append(fact) -def extract_questions(content: str) -> List[tuple[str, str, str]]: - """ - Extract questions from Q files, returning list of (category, question, full_section). - """ - # Split into main sections (### Questions or ### Hypothetical Questions) - sections = re.split(r'^###\s+.*Questions\s*$', content, flags=re.MULTILINE)[1:] - - results = [] - for section in sections: - # Find all numbered categories (1. 
**Category Name**) - categories = re.split(r'^\d+\.\s+\*\*([^*]+)\*\*\s*$', section.strip(), flags=re.MULTILINE) - - # Process each category - for i in range(1, len(categories), 2): - category = categories[i].strip() - category_content = categories[i+1].strip() - - # Extract questions (lines starting with dash and wrapped in italics) - questions = re.findall(r'^\s*-\s*\*"([^"]+)"\*\s*$', category_content, flags=re.MULTILINE) - - # Add each question with its category and full context - for q in questions: - results.append((category, q, f"Category: {category}\nQuestion: {q}")) - - return results + return file_data -def preprocess_text(text: str) -> List[str]: - """Preprocess text for better semantic matching""" - # Lowercase and tokenize - tokens = word_tokenize(text.lower()) - - # Remove stopwords but keep question words - stop_words = set(stopwords.words('english')) - {'how', 'what', 'when', 'where', 'why', 'which'} - lemmatizer = WordNetLemmatizer() - - # Lemmatize but preserve original form for technical terms - tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words] - - return tokens - -if __name__ == "__main__": - llm_manager = LLMTextManager(BASE_PATH) - - # Example 1: Concatenate docs - docs = llm_manager.concatenate_docs(["chunking_strategies", "content_selection"], "extended") - print("Concatenated docs:", docs[:200], "...\n") - - # Example 2: Search questions - results = llm_manager.search_questions("How do I execute JS script on the page?", 3) - print("Search results:", results[:200], "...") \ No newline at end of file + def refresh_index(self) -> None: + """Convenience method for a full rebuild.""" + self.build_search_index(clear_cache=True) diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 315069fb..4119c62d 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,7 +1,14 @@ from pydantic import BaseModel, HttpUrl -from typing import List, Dict, Optional, Callable, Awaitable, Union - - +from typing import 
List, Dict, Optional, Callable, Awaitable, Union, Any +from dataclasses import dataclass +@dataclass +class TokenUsage: + completion_tokens: int = 0 + prompt_tokens: int = 0 + total_tokens: int = 0 + completion_tokens_details: Optional[dict] = None + prompt_tokens_details: Optional[dict] = None + class UrlModel(BaseModel): url: HttpUrl @@ -34,7 +41,8 @@ class CrawlResult(BaseModel): session_id: Optional[str] = None response_headers: Optional[dict] = None status_code: Optional[int] = None - + ssl_certificate: Optional[Dict[str, Any]] = None + class AsyncCrawlResponse(BaseModel): html: str response_headers: Dict[str, str] @@ -43,8 +51,7 @@ class AsyncCrawlResponse(BaseModel): pdf_data: Optional[bytes] = None get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None downloaded_files: Optional[List[str]] = None + ssl_certificate: Optional[Dict[str, Any]] = None class Config: arbitrary_types_allowed = True - - diff --git a/crawl4ai/utilities/cert_exporter.py b/crawl4ai/utilities/cert_exporter.py new file mode 100644 index 00000000..2249fcc4 --- /dev/null +++ b/crawl4ai/utilities/cert_exporter.py @@ -0,0 +1,156 @@ +"""Utility functions for exporting SSL certificates in various formats.""" + +import json +import base64 +from typing import Dict, Any, Optional +from pathlib import Path +import OpenSSL.crypto +from datetime import datetime + +class CertificateExporter: + """ + Handles exporting SSL certificates in various formats: + 1. JSON - Human-readable format with all certificate details + 2. PEM - Standard text format for certificates + 3. 
DER - Binary format + """ + + @staticmethod + def _decode_cert_data(data: Any) -> Any: + """Helper method to decode bytes in certificate data.""" + if isinstance(data, bytes): + return data.decode('utf-8') + elif isinstance(data, dict): + return { + (k.decode('utf-8') if isinstance(k, bytes) else k): CertificateExporter._decode_cert_data(v) + for k, v in data.items() + } + elif isinstance(data, list): + return [CertificateExporter._decode_cert_data(item) for item in data] + return data + + @staticmethod + def to_json(cert_info: Dict[str, Any], filepath: Optional[str] = None) -> Optional[str]: + """ + Export certificate information to JSON format. + + Args: + cert_info: Dictionary containing certificate information + filepath: Optional path to save the JSON file + + Returns: + str: JSON string if filepath is None, otherwise None + """ + if not cert_info: + return None + + # Decode any bytes in the certificate data + cert_data = CertificateExporter._decode_cert_data(cert_info) + + # Convert datetime objects to ISO format strings + for key, value in cert_data.items(): + if isinstance(value, datetime): + cert_data[key] = value.isoformat() + + json_str = json.dumps(cert_data, indent=2, ensure_ascii=False) + + if filepath: + Path(filepath).write_text(json_str, encoding='utf-8') + return None + return json_str + + @staticmethod + def to_pem(cert_info: Dict[str, Any], filepath: Optional[str] = None) -> Optional[str]: + """ + Export certificate to PEM format. + This is the most common format, used for Apache/Nginx configs. 
+ + Args: + cert_info: Dictionary containing certificate information + filepath: Optional path to save the PEM file + + Returns: + str: PEM string if filepath is None, otherwise None + """ + if not cert_info or 'raw_cert' not in cert_info: + return None + + try: + x509 = OpenSSL.crypto.load_certificate( + OpenSSL.crypto.FILETYPE_ASN1, + base64.b64decode(cert_info['raw_cert']) + ) + pem_data = OpenSSL.crypto.dump_certificate( + OpenSSL.crypto.FILETYPE_PEM, + x509 + ).decode('utf-8') + + if filepath: + Path(filepath).write_text(pem_data, encoding='utf-8') + return None + return pem_data + + except Exception as e: + return f"Error converting to PEM: {str(e)}" + + @staticmethod + def to_der(cert_info: Dict[str, Any], filepath: Optional[str] = None) -> Optional[bytes]: + """ + Export certificate to DER format (binary). + This format is commonly used in Java environments. + + Args: + cert_info: Dictionary containing certificate information + filepath: Optional path to save the DER file + + Returns: + bytes: DER bytes if filepath is None, otherwise None + """ + if not cert_info or 'raw_cert' not in cert_info: + return None + + try: + der_data = base64.b64decode(cert_info['raw_cert']) + + if filepath: + Path(filepath).write_bytes(der_data) + return None + return der_data + + except Exception as e: + return None + + @staticmethod + def export_all(cert_info: Dict[str, Any], base_path: str, filename: str) -> Dict[str, str]: + """ + Export certificate in all supported formats. 
 + + Args: + cert_info: Dictionary containing certificate information + base_path: Base directory to save the files + filename: Base filename without extension + + Returns: + Dict[str, str]: Dictionary mapping format to filepath + """ + base_path = Path(base_path) + base_path.mkdir(parents=True, exist_ok=True) + + paths = {} + + # Export JSON + json_path = base_path / f"{filename}.json" + CertificateExporter.to_json(cert_info, str(json_path)) + paths['json'] = str(json_path) + + # Export PEM + pem_path = base_path / f"{filename}.pem" + CertificateExporter.to_pem(cert_info, str(pem_path)) + paths['pem'] = str(pem_path) + + # Export DER + der_path = base_path / f"{filename}.der" + CertificateExporter.to_der(cert_info, str(der_path)) + paths['der'] = str(der_path) + + return paths diff --git a/crawl4ai/utilities/ssl_utils.py b/crawl4ai/utilities/ssl_utils.py new file mode 100644 index 00000000..ea018d7b --- /dev/null +++ b/crawl4ai/utilities/ssl_utils.py @@ -0,0 +1,83 @@ +"""Utility functions for SSL certificate handling.""" + +import ssl +import socket +from typing import Dict, Any, Optional +from urllib.parse import urlparse +import OpenSSL.crypto +import datetime +import base64 + + +def get_ssl_certificate(url: str, timeout: int = 10) -> Optional[Dict[str, Any]]: + """ + Retrieve SSL certificate information from a given URL. 
+ + Args: + url (str): The URL to get SSL certificate from + timeout (int): Socket timeout in seconds + + Returns: + Optional[Dict[str, Any]]: Dictionary containing certificate information or None if not available + + The returned dictionary includes: + - subject: Certificate subject information + - issuer: Certificate issuer information + - version: SSL version + - serial_number: Certificate serial number + - not_before: Certificate validity start date + - not_after: Certificate validity end date + - fingerprint: Certificate fingerprint + - raw_cert: Base64 encoded raw certificate data + """ + try: + hostname = urlparse(url).netloc + if ':' in hostname: + hostname = hostname.split(':')[0] + + context = ssl.create_default_context() + with socket.create_connection((hostname, 443), timeout=timeout) as sock: + with context.wrap_socket(sock, server_hostname=hostname) as ssock: + cert_binary = ssock.getpeercert(binary_form=True) + x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary) + + cert_info = { + "subject": { + key: value.decode() if isinstance(value, bytes) else value + for key, value in dict(x509.get_subject().get_components()).items() + }, + "issuer": { + key: value.decode() if isinstance(value, bytes) else value + for key, value in dict(x509.get_issuer().get_components()).items() + }, + "version": x509.get_version(), + "serial_number": hex(x509.get_serial_number()), + "not_before": x509.get_notBefore().decode(), + "not_after": x509.get_notAfter().decode(), + "fingerprint": x509.digest("sha256").hex(), + "signature_algorithm": x509.get_signature_algorithm().decode(), + "raw_cert": base64.b64encode(cert_binary).decode('utf-8') + } + + # Add extensions + extensions = [] + for i in range(x509.get_extension_count()): + ext = x509.get_extension(i) + extensions.append({ + "name": ext.get_short_name().decode(), + "value": str(ext) + }) + cert_info["extensions"] = extensions + + return cert_info + + except (socket.gaierror, socket.timeout, 
ssl.SSLError, ValueError) as e: + return { + "error": str(e), + "status": "failed" + } + except Exception as e: + return { + "error": f"Unexpected error: {str(e)}", + "status": "failed" + } diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 7ecc22da..de08e02b 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1,4 +1,5 @@ import time +from urllib.parse import urlparse from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString import json @@ -6,7 +7,6 @@ import html import re import os import platform -from .html2text import HTML2Text from .prompts import PROMPT_EXTRACT_BLOCKS from .config import * from pathlib import Path @@ -14,7 +14,6 @@ from typing import Dict, Any from urllib.parse import urljoin import requests from requests.exceptions import InvalidSchema -import hashlib from typing import Optional, Tuple, Dict, Any import xxhash from colorama import Fore, Style, init @@ -1110,21 +1109,52 @@ def normalize_url_tmp(href, base_url): return href.strip() -def is_external_url(url, base_domain): - """Determine if a URL is external""" - special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'} - if any(url.lower().startswith(proto) for proto in special_protocols): +def get_base_domain(url: str) -> str: + """Extract base domain from URL, handling various edge cases.""" + try: + # Get domain from URL + domain = urlparse(url).netloc.lower() + if not domain: + return "" + + # Remove port if present + domain = domain.split(':')[0] + + # Remove www + domain = re.sub(r'^www\.', '', domain) + + # Extract last two parts of domain (handles co.uk etc) + parts = domain.split('.') + if len(parts) > 2 and parts[-2] in { + 'co', 'com', 'org', 'gov', 'edu', 'net', + 'mil', 'int', 'ac', 'ad', 'ae', 'af', 'ag' + }: + return '.'.join(parts[-3:]) + + return '.'.join(parts[-2:]) + except Exception: + return "" + +def is_external_url(url: str, base_domain: str) -> bool: + 
"""Check if URL is external to base domain.""" + special = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'} + if any(url.lower().startswith(p) for p in special): return True try: - # Handle URLs with protocol - if url.startswith(('http://', 'https://')): - url_domain = url.split('/')[2] - return base_domain.lower() not in url_domain.lower() - except IndexError: - return False + parsed = urlparse(url) + if not parsed.netloc: # Relative URL + return False + + # Strip 'www.' from both domains for comparison + url_domain = parsed.netloc.lower().replace('www.', '') + base = base_domain.lower().replace('www.', '') - return False + # Check if URL domain ends with base domain + return not url_domain.endswith(base) + except Exception: + return False + def clean_tokens(tokens: list[str]) -> list[str]: # Set of tokens to remove @@ -1289,4 +1319,7 @@ def get_error_context(exc_info, context_lines: int = 5): "line_no": line_no, "function": func_name, "code_context": code_context - } \ No newline at end of file + } + + + \ No newline at end of file diff --git a/docs/examples/browser_optimization_example.py b/docs/examples/browser_optimization_example.py new file mode 100644 index 00000000..f57dc147 --- /dev/null +++ b/docs/examples/browser_optimization_example.py @@ -0,0 +1,128 @@ +""" +This example demonstrates optimal browser usage patterns in Crawl4AI: +1. Sequential crawling with session reuse +2. Parallel crawling with browser instance reuse +3. 
Performance optimization settings +""" + +import asyncio +import os +from typing import List +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + + +async def crawl_sequential(urls: List[str]): + """ + Sequential crawling using session reuse - most efficient for moderate workloads + """ + print("\n=== Sequential Crawling with Session Reuse ===") + + # Configure browser with optimized settings + browser_config = BrowserConfig( + headless=True, + browser_args=[ + "--disable-gpu", # Disable GPU acceleration + "--disable-dev-shm-usage", # Disable /dev/shm usage + "--no-sandbox", # Required for Docker + ], + viewport={ + "width": 800, + "height": 600, + }, # Smaller viewport for better performance + ) + + # Configure crawl settings + crawl_config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + # content_filter=PruningContentFilter(), In case you need fit_markdown + ), + ) + + # Create single crawler instance + crawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + + try: + session_id = "session1" # Use same session for all URLs + for url in urls: + result = await crawler.arun( + url=url, + config=crawl_config, + session_id=session_id, # Reuse same browser tab + ) + if result.success: + print(f"Successfully crawled {url}") + print(f"Content length: {len(result.markdown_v2.raw_markdown)}") + finally: + await crawler.close() + + +async def crawl_parallel(urls: List[str], max_concurrent: int = 3): + """ + Parallel crawling while reusing browser instance - best for large workloads + """ + print("\n=== Parallel Crawling with Browser Reuse ===") + + browser_config = BrowserConfig( + headless=True, + browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"], + viewport={"width": 800, "height": 600}, + ) + + crawl_config = CrawlerRunConfig( + 
markdown_generator=DefaultMarkdownGenerator( + # content_filter=PruningContentFilter(), In case you need fit_markdown + ), + ) + + # Create single crawler instance for all parallel tasks + crawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + + try: + # Create tasks in batches to control concurrency + for i in range(0, len(urls), max_concurrent): + batch = urls[i : i + max_concurrent] + tasks = [] + + for j, url in enumerate(batch): + session_id = ( + f"parallel_session_{j}" # Different session per concurrent task + ) + task = crawler.arun(url=url, config=crawl_config, session_id=session_id) + tasks.append(task) + + # Wait for batch to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + for url, result in zip(batch, results): + if isinstance(result, Exception): + print(f"Error crawling {url}: {str(result)}") + elif result.success: + print(f"Successfully crawled {url}") + print(f"Content length: {len(result.markdown_v2.raw_markdown)}") + finally: + await crawler.close() + + +async def main(): + # Example URLs + urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + "https://example.com/page4", + ] + + # Demo sequential crawling + await crawl_sequential(urls) + + # Demo parallel crawling + await crawl_parallel(urls, max_concurrent=2) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/extraction_strategies_example.py b/docs/examples/extraction_strategies_example.py new file mode 100644 index 00000000..348b891e --- /dev/null +++ b/docs/examples/extraction_strategies_example.py @@ -0,0 +1,115 @@ +""" +Example demonstrating different extraction strategies with various input formats. +This example shows how to: +1. Use different input formats (markdown, HTML, fit_markdown) +2. Work with JSON-based extractors (CSS and XPath) +3. Use LLM-based extraction with different input formats +4. 
Configure browser and crawler settings properly +""" + +import asyncio +import os +from typing import Dict, Any + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import ( + LLMExtractionStrategy, + JsonCssExtractionStrategy, + JsonXPathExtractionStrategy +) +from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str): + """Helper function to run extraction with proper configuration""" + try: + # Configure the crawler run settings + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=strategy, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() # For fit_markdown support + ) + ) + + # Run the crawler + result = await crawler.arun(url=url, config=config) + + if result.success: + print(f"\n=== {name} Results ===") + print(f"Extracted Content: {result.extracted_content}") + print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}") + print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}") + else: + print(f"Error in {name}: Crawl failed") + + except Exception as e: + print(f"Error in {name}: {str(e)}") + +async def main(): + # Example URL (replace with actual URL) + url = "https://example.com/product-page" + + # Configure browser settings + browser_config = BrowserConfig( + headless=True, + verbose=True + ) + + # Initialize extraction strategies + + # 1. 
LLM Extraction with different input formats + markdown_strategy = LLMExtractionStrategy( + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), + instruction="Extract product information including name, price, and description" + ) + + html_strategy = LLMExtractionStrategy( + input_format="html", + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), + instruction="Extract product information from HTML including structured data" + ) + + fit_markdown_strategy = LLMExtractionStrategy( + input_format="fit_markdown", + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), + instruction="Extract product information from cleaned markdown" + ) + + # 2. JSON CSS Extraction (automatically uses HTML input) + css_schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h1.product-title", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"}, + {"name": "description", "selector": ".description", "type": "text"} + ] + } + css_strategy = JsonCssExtractionStrategy(schema=css_schema) + + # 3. 
JSON XPath Extraction (automatically uses HTML input) + xpath_schema = { + "baseSelector": "//div[@class='product']", + "fields": [ + {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"}, + {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"}, + {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"} + ] + } + xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema) + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + # Run all strategies + await run_extraction(crawler, url, markdown_strategy, "Markdown LLM") + await run_extraction(crawler, url, html_strategy, "HTML LLM") + await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM") + await run_extraction(crawler, url, css_strategy, "CSS Extraction") + await run_extraction(crawler, url, xpath_strategy, "XPath Extraction") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/full_page_screenshot_and_pdf_export.md b/docs/examples/full_page_screenshot_and_pdf_export.md index 1afc24ba..8522675c 100644 --- a/docs/examples/full_page_screenshot_and_pdf_export.md +++ b/docs/examples/full_page_screenshot_and_pdf_export.md @@ -39,8 +39,8 @@ async def main(): f.write(b64decode(result.screenshot)) # Save PDF - if result.pdf_data: - pdf_bytes = b64decode(result.pdf_data) + if result.pdf: + pdf_bytes = b64decode(result.pdf) with open(os.path.join(__location__, "page.pdf"), "wb") as f: f.write(pdf_bytes) diff --git a/docs/llm.txt/10_file_download.q.md b/docs/llm.txt/10_file_download.q.md index 72be37a1..987149e7 100644 --- a/docs/llm.txt/10_file_download.q.md +++ b/docs/llm.txt/10_file_download.q.md @@ -1,63 +1,10 @@ -### Hypothetical Questions - -1. 
**Enabling Downloads** - - *"How do I configure Crawl4AI to allow file downloads during a crawl?"* - - *"Where in my code should I set `accept_downloads=True` to enable downloads?"* - -2. **Specifying the Download Location** - - *"How can I choose a custom directory for storing downloaded files?"* - - *"What is the default download directory if I don’t specify one?"* - -3. **Triggering Downloads from Pages** - - *"How do I simulate a click on a download link or button to initiate file downloads?"* - - *"Can I use JavaScript injection (`js_code`) to trigger downloads from the webpage elements?"* - - *"What does `wait_for` do, and how do I use it to ensure the download starts before proceeding?"* - -4. **Accessing Downloaded Files** - - *"Where can I find the paths to the files that I’ve downloaded?"* - - *"How do I check if any files were downloaded after my crawl completes?"* - -5. **Multiple Downloads** - - *"How do I handle scenarios where multiple files need to be downloaded sequentially?"* - - *"Can I introduce delays between file downloads to prevent server overload?"* - -6. **Error Handling and Reliability** - - *"What if the files I expect to download don’t appear or the links are broken?"* - - *"How can I handle incorrect paths, nonexistent directories, or failed downloads gracefully?"* - -7. **Timing and Performance** - - *"When should I use `wait_for` and how do I choose an appropriate delay?"* - - *"Can I start the download and continue processing other tasks concurrently?"* - -8. **Security Considerations** - - *"What precautions should I take with downloaded files?"* - - *"How can I ensure that downloaded files are safe before processing them further?"* - -9. 
**Integration with Other Crawl4AI Features** - - *"Can I combine file downloading with other extraction strategies or LLM-based processes?"* - - *"How do I manage downloads when running multiple parallel crawls?"* - -### Topics Discussed in the File - -- **Enabling Downloads in Crawl4AI**: - Configure the crawler through `BrowserConfig` or `CrawlerRunConfig` to allow file downloads. - -- **Download Locations**: - Specify a custom `downloads_path` or rely on the default directory (`~/.crawl4ai/downloads`). - -- **Triggering File Downloads**: - Use JavaScript code injection (`js_code`) to simulate user interactions (e.g., clicking a download link). Employ `wait_for` to allow time for downloads to initiate. - -- **Accessing Downloaded Files**: - After the crawl, `result.downloaded_files` provides a list of paths to the downloaded files. Use these paths to verify file sizes or further process the files. - -- **Handling Multiple Files**: - Loop through downloadable elements on the page, introduce delays, and wait for downloads to complete before proceeding. - -- **Error and Timing Considerations**: - Manage potential errors when downloads fail or timing issues arise. Adjust `wait_for` and error handling logic to ensure stable and reliable file retrievals. - -- **Security Precautions**: - Always verify the integrity and safety of downloaded files before using them in your application. - -In summary, the file explains how to set up, initiate, and manage file downloads within the Crawl4AI framework, including specifying directories, triggering downloads programmatically, handling multiple files, and accessing downloaded results. It also covers timing, error handling, and security best practices. 
\ No newline at end of file +enable_downloads: Downloads must be enabled using accept_downloads parameter in BrowserConfig or CrawlerRunConfig | download settings, enable downloads | BrowserConfig(accept_downloads=True) +download_location: Set custom download directory using downloads_path in BrowserConfig, defaults to ~/.crawl4ai/downloads | download folder, save location | BrowserConfig(downloads_path="/path/to/downloads") +download_trigger: Trigger downloads using js_code in CrawlerRunConfig to simulate click actions | download button, click download | CrawlerRunConfig(js_code="document.querySelector('a[download]').click()") +download_timing: Control download timing using wait_for parameter in CrawlerRunConfig | download wait, timeout | CrawlerRunConfig(wait_for=5) +access_downloads: Access downloaded files through downloaded_files attribute in CrawlResult | download results, file paths | result.downloaded_files +multiple_downloads: Download multiple files by clicking multiple download links with delay | batch download, multiple files | js_code="const links = document.querySelectorAll('a[download]'); for(const link of links) { link.click(); }" +download_verification: Check download success by examining downloaded_files list and file sizes | verify downloads, file check | if result.downloaded_files: print(os.path.getsize(file_path)) +browser_context: Downloads are managed within browser context and require proper js_code targeting | download management, browser scope | CrawlerRunConfig(js_code="...") +error_handling: Handle failed downloads and incorrect paths for robust download management | download errors, error handling | try-except around download operations +security_consideration: Scan downloaded files for security threats before use | security check, virus scan | No direct code reference \ No newline at end of file diff --git a/docs/llm.txt/11_page_interaction.q.md b/docs/llm.txt/11_page_interaction.q.md index e469947f..a28e5b11 100644 ---
a/docs/llm.txt/11_page_interaction.q.md +++ b/docs/llm.txt/11_page_interaction.q.md @@ -1,64 +1,10 @@ -Below is a structured list of hypothetical questions derived from the file’s content, followed by a bullet-point summary of key topics discussed. - -### Hypothetical Questions - -1. **JavaScript Execution Basics** - - *"How do I inject a single JavaScript command into the page using Crawl4AI?"* - - *"Can I run multiple JavaScript commands sequentially before extracting content?"* - -2. **Waiting for Conditions** - - *"How can I wait for a particular CSS element to appear before extracting data?"* - - *"Is there a way to wait for a custom JavaScript condition, like a minimum number of items to load?"* - -3. **Handling Dynamic Content** - - *"How do I deal with infinite scrolling or 'Load More' buttons to continuously fetch new data?"* - - *"Can I simulate user interactions (clicking buttons, scrolling) to reveal more content?"* - -4. **Form Interactions** - - *"How can I fill out and submit a form on a webpage using JavaScript injection?"* - - *"What if I need to handle multiple form fields or a multi-step submission process?"* - -5. **Timing Control and Delays** - - *"How can I set a page load timeout or introduce a delay before extracting the final HTML?"* - - *"When should I adjust `delay_before_return_html` to ensure the page is fully rendered?"* - -6. **Complex Interactions** - - *"How do I chain multiple interactions, like accepting cookies, scrolling, and then clicking 'Load More' several times?"* - - *"Can I maintain a session to continue interacting with the page across multiple steps?"* - -7. **Integration with Extraction Strategies** - - *"How do I combine JavaScript-based interactions with a structured extraction strategy like `JsonCssExtractionStrategy`?"* - - *"Is it possible to use LLM-based extraction after dynamically revealing more content?"* - -8. 
**Troubleshooting Interactions** - - *"What if my JavaScript code fails or the element I want to interact with isn’t available?"* - - *"How can I verify that the dynamic content I triggered actually loaded before extraction?"* - -9. **Performance and Reliability** - - *"Do I need to consider timeouts and backoffs when dealing with heavily dynamic pages?"* - - *"How can I ensure that my JS-based interactions do not slow down the extraction process unnecessarily?"* - -### Topics Discussed in the File - -- **JavaScript Execution**: - Injecting single or multiple JS commands into the page to manipulate scrolling, clicks, or form submissions. - -- **Waiting Mechanisms**: - Using `wait_for` with CSS selectors (`"css:.some-element"`) or custom JavaScript conditions (`"js:() => {...}"`) to ensure the page is in the desired state before extraction. - -- **Dynamic Content Handling**: - Techniques for infinite scrolling, load more buttons, and other elements that reveal additional data after user-like interactions. - -- **Form Interaction**: - Filling out form fields, submitting forms, and waiting for results to appear. - -- **Timing Control**: - Setting page timeouts, introducing delays before returning HTML, and ensuring stable and complete extractions. - -- **Complex Interactions**: - Combining multiple steps (cookie acceptance, infinite scroll, load more clicks) and maintaining sessions across multiple steps for fully dynamic pages. - -- **Integration with Extraction Strategies**: - Applying pattern-based (CSS/JSON) or LLM-based extraction after performing required interactions to reveal the content of interest. - -In summary, the file provides detailed guidance on interacting with dynamic pages in Crawl4AI. It shows how to run JavaScript commands, wait for certain conditions, handle infinite scroll or complex user interactions, and integrate these techniques with content extraction strategies. 
\ No newline at end of file +javascript_execution: Execute single or multiple JavaScript commands in webpage | js code, javascript commands, browser execution | CrawlerRunConfig(js_code="window.scrollTo(0, document.body.scrollHeight);") +css_wait: Wait for specific CSS elements to appear on page | css selector, element waiting, dynamic content | CrawlerRunConfig(wait_for="css:.dynamic-content") +js_wait_condition: Define custom JavaScript wait conditions for dynamic content | javascript waiting, conditional wait, custom conditions | CrawlerRunConfig(wait_for="js:() => document.querySelectorAll('.item').length > 10") +infinite_scroll: Handle infinite scroll and load more buttons | pagination, dynamic loading, scroll handling | CrawlerRunConfig(js_code="window.scrollTo(0, document.body.scrollHeight);") +form_interaction: Fill and submit forms using JavaScript | form handling, input filling, form submission | CrawlerRunConfig(js_code="document.querySelector('#search').value = 'search term';") +timing_control: Set page timeouts and delays before content capture | page timing, delays, timeouts | CrawlerRunConfig(page_timeout=60000, delay_before_return_html=2.0) +session_management: Maintain browser session for multiple interactions | session handling, browser state, session cleanup | crawler.crawler_strategy.kill_session(session_id) +cookie_consent: Handle cookie consent popups and notifications | cookie handling, popup management | CrawlerRunConfig(js_code="document.querySelector('.cookie-accept')?.click();") +extraction_combination: Combine page interactions with structured data extraction | data extraction, content parsing | JsonCssExtractionStrategy(schema), LLMExtractionStrategy(schema) +dynamic_content_loading: Wait for and verify dynamic content loading | content verification, dynamic loading | wait_for="js:() => document.querySelector('.content').innerText.length > 100" \ No newline at end of file diff --git a/docs/llm.txt/12_prefix_based_input.q.md 
b/docs/llm.txt/12_prefix_based_input.q.md index 2e6d4f03..7ff392bd 100644 --- a/docs/llm.txt/12_prefix_based_input.q.md +++ b/docs/llm.txt/12_prefix_based_input.q.md @@ -1,56 +1,10 @@ -### Hypothetical Questions - -1. **Basic Usage** - - *"How can I crawl a regular website URL using Crawl4AI?"* - - *"What configuration object do I need to pass to `arun` for basic crawling scenarios?"* - -2. **Local HTML Files** - - *"How do I crawl an HTML file stored locally on my machine?"* - - *"What prefix should I use when specifying a local file path to `arun`?"* - -3. **Raw HTML Strings** - - *"Is it possible to crawl a raw HTML string without saving it to a file first?"* - - *"How do I prefix a raw HTML string so that Crawl4AI treats it like HTML content?"* - -4. **Verifying Results** - - *"Can I compare the extracted Markdown content from a live page with that of a locally saved or raw version to ensure they match?"* - - *"How do I handle errors or check if the crawl was successful?"* - -5. **Use Cases** - - *"When would I want to use `file://` vs. `raw:` URLs?"* - - *"Can I reuse the same code structure for various input types (web URL, file, raw HTML)?"* - -6. **Caching and Configuration** - - *"What does `bypass_cache=True` do and when should I use it?"* - - *"Is there a simpler way to configure crawling options uniformly across web URLs, local files, and raw HTML?"* - -7. **Practical Scenarios** - - *"How can I integrate file-based crawling into a pipeline that starts from a live page, saves the HTML, and then crawls that local file for consistency checks?"* - - *"Does Crawl4AI’s prefix-based handling allow me to pre-process raw HTML (e.g., downloaded from another source) without hosting it on a local server?"* - -### Topics Discussed in the File - -- **Prefix-Based Input Handling**: - Introducing the concept of using `http://` or `https://` for web URLs, `file://` for local files, and `raw:` for direct HTML strings. 
This unified approach allows seamless handling of different content sources within Crawl4AI. - -- **Crawling a Web URL**: - Demonstrating how to crawl a live web page (like a Wikipedia article) using `AsyncWebCrawler` and `CrawlerRunConfig`. - -- **Crawling a Local HTML File**: - Showing how to convert a local file path to a `file://` URL and use `arun` to process it, ensuring that previously saved HTML can be re-crawled for verification or offline analysis. - -- **Crawling Raw HTML Content**: - Explaining how to directly pass an HTML string prefixed with `raw:` to `arun`, enabling quick tests or processing of HTML code obtained from other sources without saving it to disk. - -- **Consistency and Verification**: - Providing a comprehensive example that: - 1. Crawls a live Wikipedia page. - 2. Saves the HTML to a file. - 3. Re-crawls the local file. - 4. Re-crawls the content as a raw HTML string. - 5. Verifies that the Markdown extracted remains consistent across all three methods. - -- **Integration with `CrawlerRunConfig`**: - Showing how to use `CrawlerRunConfig` to disable caching (`bypass_cache=True`) and ensure fresh results for each test run. - -In summary, the file highlights how to use Crawl4AI’s prefix-based handling to effortlessly switch between crawling live web pages, local HTML files, and raw HTML strings. It also demonstrates a detailed workflow for verifying consistency and correctness across various input methods. 
\ No newline at end of file +url_prefix_handling: Crawl4AI supports different URL prefixes for various input types | input handling, url format, crawling types | url="https://example.com" or "file://path" or "raw:html" +web_crawling: Crawl live web pages using http:// or https:// prefixes with AsyncWebCrawler | web scraping, url crawling, web content | AsyncWebCrawler().arun(url="https://example.com") +local_file_crawling: Access local HTML files using file:// prefix for crawling | local html, file crawling, file access | AsyncWebCrawler().arun(url="file:///path/to/file.html") +raw_html_crawling: Process raw HTML content directly using raw: prefix | html string, raw content, direct html | AsyncWebCrawler().arun(url="raw:content") +crawler_config: Configure crawling behavior using CrawlerRunConfig object | crawler settings, configuration, bypass cache | CrawlerRunConfig(bypass_cache=True) +async_context: AsyncWebCrawler should be used within async context manager | async with, context management, async programming | async with AsyncWebCrawler() as crawler +crawl_result: Crawler returns result object containing success status, markdown and error messages | response handling, crawl output, result parsing | result.success, result.markdown, result.error_message +html_to_markdown: Crawler automatically converts HTML content to markdown format | format conversion, markdown generation, content processing | result.markdown +error_handling: Check crawl success status and handle error messages appropriately | error checking, failure handling, status verification | if result.success: ... 
else: print(result.error_message) +content_verification: Compare markdown length between different crawling methods for consistency | content validation, length comparison, consistency check | assert web_crawl_length == local_crawl_length \ No newline at end of file diff --git a/docs/llm.txt/13_hooks_auth.q.md b/docs/llm.txt/13_hooks_auth.q.md index 266a5278..c269b9fc 100644 --- a/docs/llm.txt/13_hooks_auth.q.md +++ b/docs/llm.txt/13_hooks_auth.q.md @@ -1,58 +1,12 @@ -Below is a structured list of hypothetical questions derived from the file’s content, followed by a bullet-point summary of key topics discussed. - -### Hypothetical Questions - -1. **General Hook Usage** - - *"What are hooks in Crawl4AI, and how do they help customize the crawling process?"* - - *"Which stages of the crawling lifecycle can I attach hooks to?"* - -2. **Specific Hooks** - - *"What does the `on_browser_created` hook allow me to do?"* - - *"How can I use the `on_page_context_created` hook to modify requests before navigation?"* - - *"When should I use `before_goto` and `after_goto` hooks?"* - - *"How does `on_execution_started` help with custom JavaScript execution?"* - - *"What kind of preprocessing can I do in `before_return_html`?"* - -3. **Authentication and Customization** - - *"How can I perform authentication (like logging in) before actual crawling begins?"* - - *"Can I set cookies, headers, or modify requests using hooks?"* - -4. **Error Handling and Debugging** - - *"If my hooks fail or raise errors, how is that handled during the crawling process?"* - - *"How can I use hooks to troubleshoot issues, like blocking image requests or logging console messages?"* - -5. **Complex Scenarios** - - *"Can I combine multiple hooks to handle complex workflows like login, script execution, and dynamic content blocking?"* - - *"Is it possible to add conditional logic in hooks to treat certain URLs differently?"* - -6. 
**Performance and Reliability** - - *"Do these hooks run asynchronously, and how does that affect the crawler’s performance?"* - - *"Can I cancel requests or actions via hooks to improve efficiency?"* - -7. **Integration with `BrowserConfig` and `CrawlerRunConfig`** - - *"How do I use `BrowserConfig` and `CrawlerRunConfig` in tandem with hooks?"* - - *"Does setting hooks require changes to the configuration objects or can I apply them at runtime?"* - -### Topics Discussed in the File - -- **Hooks in `AsyncWebCrawler`**: - Hooks are asynchronous callback functions triggered at key points in the crawling lifecycle. They allow advanced customization, such as modifying browser/page contexts, injecting scripts, or altering network requests. - -- **Hook Types and Purposes**: - - **`on_browser_created`**: Initialize browser state, handle authentication (login), set cookies. - - **`on_page_context_created`**: Set up request routing, block resources, or modify requests before navigation. - - **`before_goto`**: Add or modify HTTP headers, prepare the page before actually navigating to the target URL. - - **`after_goto`**: Verify the current URL, log details, or ensure that page navigation succeeded. - - **`on_execution_started`**: Perform actions right after JS execution, like logging console output or checking state. - - **`before_return_html`**: Analyze, log, or preprocess the extracted HTML before it’s returned. - -- **Practical Examples**: - Demonstrations of handling authentication via `on_browser_created`, blocking images using `on_page_context_created` with a custom routing function, adding HTTP headers in `before_goto`, and logging content details in `before_return_html`. - -- **Integration with Configuration Objects**: - Using `BrowserConfig` for initial browser settings and `CrawlerRunConfig` for specifying JavaScript code, wait conditions, and more, then combining them with hooks for a fully customizable crawling workflow. 
- -- **Asynchronous and Flexible**: - Hooks are async, fitting seamlessly into the event-driven model of crawling. They can abort requests, continue them, or conditionally modify behavior based on URL patterns. - -In summary, this file explains how to use hooks in Crawl4AI’s `AsyncWebCrawler` to customize nearly every aspect of the crawling process. By attaching hooks at various lifecycle stages, developers can implement authentication routines, block certain types of requests, tweak headers, run custom JS, and analyze the final HTML—all while maintaining control and flexibility. \ No newline at end of file +crawler_hooks: AsyncWebCrawler supports customizable hooks for modifying crawler behavior | hooks, async functions, crawler customization | crawler.crawler_strategy.set_hook() +browser_creation_hook: on_browser_created hook executes when browser is initialized for authentication and setup | browser setup, login, authentication | async def on_browser_created(browser: Browser, **kwargs) +page_context_hook: on_page_context_created hook handles routing and initial page setup | page context, routing, resource blocking | async def on_page_context_created(context: BrowserContext, page: Page, **kwargs) +navigation_pre_hook: before_goto hook allows adding custom headers before URL navigation | headers, pre-navigation, request modification | async def before_goto(page: Page, context: BrowserContext, **kwargs) +navigation_post_hook: after_goto hook executes after URL navigation for verification | post-navigation, URL logging | async def after_goto(page: Page, context: BrowserContext, **kwargs) +js_execution_hook: on_execution_started hook runs after custom JavaScript execution | JavaScript, script execution | async def on_execution_started(page: Page, context: BrowserContext, **kwargs) +html_processing_hook: before_return_html hook processes HTML content before returning | HTML content, preprocessing | async def before_return_html(page: Page, context: BrowserContext, 
html: str, **kwargs) +browser_configuration: BrowserConfig allows setting headless mode and viewport dimensions | browser settings, viewport | BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080) +crawler_configuration: CrawlerRunConfig defines JavaScript execution and wait conditions | crawler settings, JS code, wait conditions | CrawlerRunConfig(js_code="window.scrollTo(0, 0)", wait_for="footer") +resource_management: Route handlers can block or modify specific resource types | resource blocking, request handling | if route.request.resource_type == "image": await route.abort() +authentication_flow: Browser authentication handled through login form interaction and cookie setting | login process, cookies | await page.fill("input[name='username']", "testuser") +hook_registration: Hooks are registered using the crawler strategy's set_hook method | hook setup, strategy | crawler.crawler_strategy.set_hook("hook_name", hook_function) \ No newline at end of file diff --git a/docs/llm.txt/14_proxy_security.q.md b/docs/llm.txt/14_proxy_security.q.md index c7c8bc70..1489d277 100644 --- a/docs/llm.txt/14_proxy_security.q.md +++ b/docs/llm.txt/14_proxy_security.q.md @@ -1,53 +1,8 @@ -### Hypothetical Questions - -1. **Basic Proxy Configuration** - - *"How do I set a basic HTTP proxy for the crawler?"* - - *"Can I use a SOCKS proxy instead of an HTTP proxy?"* - -2. **Authenticated Proxies** - - *"How do I provide a username and password for an authenticated proxy server?"* - - *"What is the `proxy_config` dictionary, and how do I use it?"* - -3. **Rotating Proxies** - - *"How can I dynamically change the proxy server for each request?"* - - *"What patterns or logic can I implement to rotate proxies from a pool?"* - -4.
**Custom Headers for Security and Anonymity** - - *"How do I set custom HTTP headers in `BrowserConfig` to appear more human-like or meet security policies?"* - - *"Can I add headers like `X-Forwarded-For`, `Accept-Language`, or `Cache-Control`?"* - -5. **Combining Proxies with Magic Mode** - - *"What is Magic Mode, and how does it help with anti-detection features?"* - - *"Can I use Magic Mode in combination with proxies and custom headers for better anonymity?"* - -6. **Troubleshooting and Edge Cases** - - *"What if my authenticated proxy doesn’t accept credentials?"* - - *"How do I handle errors when switching proxies mid-crawl?"* - -7. **Performance and Reliability** - - *"Does using a proxy slow down the crawling process?"* - - *"How do I ensure stable and fast connections when rotating proxies frequently?"* - -8. **Integration with Other Crawl4AI Features** - - *"Can I use proxy configurations with hooks, caching, or LLM extraction strategies?"* - - *"How do I integrate proxy-based crawling into a larger pipeline that includes data extraction and content filtering?"* - - -### Topics Discussed in the File - -- **Proxy Configuration**: - Shows how to set an HTTP or SOCKS proxy in `BrowserConfig` for the crawler, enabling you to route traffic through a specific server. - -- **Authenticated Proxies**: - Demonstrates how to provide username and password credentials to access proxy servers that require authentication. - -- **Rotating Proxies**: - Suggests a pattern for dynamically updating proxy settings before each request, allowing you to cycle through multiple proxies to avoid throttling or blocking. - -- **Custom Headers**: - Explains how to add custom HTTP headers in `BrowserConfig` for security, anonymity, or compliance with certain websites’ requirements. 
- -- **Integration with Magic Mode**: - Shows how to combine proxy usage, custom headers, and Magic Mode (`magic=True` in `CrawlerRunConfig`) to enhance anti-detection measures, making it harder for websites to detect automated crawlers. - -In summary, the file explains how to configure proxies (including authenticated proxies), rotate them dynamically, set custom headers for extra security and privacy, and combine these techniques with Magic Mode for robust anti-detection strategies in Crawl4AI. \ No newline at end of file +proxy_setup: Configure basic proxy in Crawl4AI using BrowserConfig with proxy URL | proxy configuration, proxy setup, basic proxy | BrowserConfig(proxy="http://proxy.example.com:8080") +socks_proxy: Use SOCKS proxy protocol for web crawling | SOCKS5, proxy protocol, SOCKS connection | BrowserConfig(proxy="socks5://proxy.example.com:1080") +authenticated_proxy: Set up proxy with username and password authentication | proxy auth, proxy credentials, authenticated connection | BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080", "username": "user", "password": "pass"}) +rotating_proxies: Implement dynamic proxy rotation during crawling | proxy rotation, proxy switching, dynamic proxies | browser_config.proxy_config = await get_next_proxy() +custom_headers: Add security headers to browser configuration for enhanced protection | HTTP headers, request headers, security headers | BrowserConfig(headers={"X-Forwarded-For": "203.0.113.195", "Accept-Language": "en-US,en;q=0.9"}) +magic_mode: Combine proxy settings with Magic Mode for maximum anti-detection | anti-detection, stealth mode, protection features | CrawlerRunConfig(magic=True) with BrowserConfig(proxy="http://proxy.example.com:8080") +crawler_context: Use AsyncWebCrawler with async context manager for proper resource management | async crawler, context manager, crawler setup | async with AsyncWebCrawler(config=browser_config) as crawler +cache_control: Set cache control headers 
to prevent caching during crawling | caching headers, no-cache, cache prevention | BrowserConfig(headers={"Cache-Control": "no-cache", "Pragma": "no-cache"}) \ No newline at end of file diff --git a/docs/llm.txt/15_screenshot_and_pdf_export.md b/docs/llm.txt/15_screenshot_and_pdf_export.md index 61f29348..4dcc3ff1 100644 --- a/docs/llm.txt/15_screenshot_and_pdf_export.md +++ b/docs/llm.txt/15_screenshot_and_pdf_export.md @@ -39,8 +39,8 @@ async def main(): f.write(b64decode(result.screenshot)) # Save PDF - if result.pdf_data: - pdf_bytes = b64decode(result.pdf_data) + if result.pdf: + pdf_bytes = b64decode(result.pdf) with open(os.path.join(__location__, "page.pdf"), "wb") as f: f.write(pdf_bytes) diff --git a/docs/llm.txt/15_screenshot_and_pdf_export.q.md b/docs/llm.txt/15_screenshot_and_pdf_export.q.md index 921ace9a..04e466d7 100644 --- a/docs/llm.txt/15_screenshot_and_pdf_export.q.md +++ b/docs/llm.txt/15_screenshot_and_pdf_export.q.md @@ -1,50 +1,9 @@ -Below is a structured list of hypothetical questions derived from the file’s content, followed by a bullet-point summary of key topics discussed. - -### Hypothetical Questions - -1. **Motivation and Use Cases** - - *"Why should I use the PDF-based screenshot approach for very long web pages?"* - - *"What are the benefits of generating a PDF before converting it to an image?"* - -2. **Workflow and Technical Process** - - *"How does Crawl4AI generate a PDF and then convert it into a screenshot?"* - - *"Do I need to manually scroll or stitch images to capture large pages?"* - -3. **Practical Steps** - - *"What code do I need to write to request both a PDF and a screenshot in one crawl?"* - - *"How do I save the resulting PDF and screenshot to disk?"* - -4. **Performance and Reliability** - - *"Will this PDF-based method time out or fail for extremely long pages?"* - - *"Is this approach faster or more memory-efficient than traditional full-page screenshots?"* - -5. 
**Additional Features and Customization** - - *"Can I save only the PDF without generating a screenshot?"* - - *"If I have a PDF, can I easily convert it to multiple images or just the first page?"* - -6. **Integration with Other Crawl4AI Features** - - *"Can I combine PDF/screenshot generation with other Crawl4AI extraction strategies or hooks?"* - - *"Is caching or proxying affected by PDF or screenshot generation?" - -7. **Troubleshooting** - - *"What should I do if the screenshot or PDF does not appear in the result?"* - - *"How do I handle large PDF sizes or slow saves when dealing with massive pages?"* - -### Topics Discussed in the File - -- **New Approach to Large Page Screenshots**: - The document introduces a method to first export a page as a PDF using the browser’s built-in PDF rendering capabilities and then convert that PDF to an image if a screenshot is requested. - -- **Advantages Over Traditional Methods**: - This approach avoids timeouts, memory issues, and the complexity of stitching multiple images for extremely long pages. The PDF rendering is stable, reliable, and does not require the crawler to scroll through the entire page. - -- **One-Stop Solution**: - By enabling `pdf=True` and `screenshot=True`, you receive both the full-page PDF and a screenshot (converted from the PDF) in a single crawl. This reduces repetitive processes and complexity. - -- **How to Implement**: - Demonstrates code usage with `arun` to request both the PDF and screenshot, and how to save them to files. Explains that if a PDF is already generated, the screenshot is derived directly from it, simplifying the workflow. - -- **Integration and Efficiency**: - Compatible with other Crawl4AI features like caching and extraction strategies. Simplifies large-scale crawling pipelines needing both a textual representation (HTML extraction) and visual confirmations (PDF/screenshot). 
- -In summary, the file outlines a new feature for capturing full-page screenshots of massive web pages by first generating a stable, reliable PDF, then converting it into an image. This technique eliminates previous issues related to large content pages, ensuring smoother performance and simpler code maintenance. \ No newline at end of file +page_capture: Full-page screenshots and PDFs can be generated for massive webpages using Crawl4AI | webpage capture, full page screenshot, pdf export | AsyncWebCrawler().arun(url=url, pdf=True, screenshot=True) +pdf_approach: Pages are first exported as PDF then converted to high-quality images for better handling of large content | pdf conversion, image export, page rendering | result.pdf, result.screenshot +export_benefits: PDF export method never times out and works with any page length | timeout handling, page size limits, reliability | pdf=True +dual_output: Get both PDF and screenshot in single crawl without reloading | multiple formats, single pass, efficient capture | pdf=True, screenshot=True +result_handling: Screenshot and PDF data are returned as base64 encoded strings | base64 encoding, binary data, file saving | b64decode(result.screenshot), b64decode(result.pdf) +cache_control: Cache mode can be bypassed for fresh page captures | caching, fresh content, bypass cache | cache_mode=CacheMode.BYPASS +async_operation: Crawler operates asynchronously using Python's asyncio framework | async/await, concurrent execution | async with AsyncWebCrawler() as crawler +file_saving: Screenshots and PDFs can be saved directly to local files | file output, save results, local storage | open("screenshot.png", "wb"), open("page.pdf", "wb") +error_handling: Success status can be checked before processing results | error checking, result validation | if result.success: \ No newline at end of file diff --git a/docs/llm.txt/16_storage_state.q.md b/docs/llm.txt/16_storage_state.q.md new file mode 100644 index 00000000..baf1bb1f --- 
/dev/null +++ b/docs/llm.txt/16_storage_state.q.md @@ -0,0 +1,10 @@ +storage_state_concept: Storage state preserves session data including cookies and localStorage across crawler runs | session persistence, state management | storage_state="mystate.json" +storage_state_formats: Storage state can be provided as either a dictionary or path to JSON file | state configuration, json format | storage_state={"cookies": [...], "origins": [...]} +cookie_structure: Cookies in storage state require name, value, domain, path, and expiration properties | cookie configuration, session cookies | "cookies": [{"name": "session", "value": "abcd1234", "domain": "example.com"}] +localstorage_structure: localStorage entries are organized by origin with name-value pairs | web storage, browser storage | "localStorage": [{"name": "token", "value": "my_auth_token"}] +authentication_preservation: Storage state enables starting crawls in authenticated state without repeating login flow | session management, login persistence | AsyncWebCrawler(storage_state="my_storage_state.json") +state_export: Browser context state can be exported to JSON file after successful login | session export, state saving | await context.storage_state(path="my_storage_state.json") +login_automation: Initial login can be performed using the on_browser_created hook to establish authenticated state | authentication automation, login process | on_browser_created(browser) +persistent_context: Crawler supports persistent context with user data directory for maintaining state | browser persistence, session storage | use_persistent_context=True, user_data_dir="./my_user_data" +protected_content: Storage state enables direct access to protected content by preserving authentication tokens | authenticated access, protected pages | crawler.arun(url='https://example.com/protected') +state_reuse: Subsequent crawler runs can reuse saved storage state to skip authentication steps | session reuse, login bypass |
AsyncWebCrawler(storage_state="my_storage_state.json") \ No newline at end of file diff --git a/docs/llm.txt/16_storage_state_q.md b/docs/llm.txt/16_storage_state_q.md deleted file mode 100644 index 241029bd..00000000 --- a/docs/llm.txt/16_storage_state_q.md +++ /dev/null @@ -1,52 +0,0 @@ -### Hypothetical Questions - -1. **Basic Concept of `storage_state`** - - *"What is `storage_state` and how does it help me maintain session data across crawls?"* - - *"Can I directly provide a dictionary of cookies and localStorage data, or do I need a file?"* - -2. **Cookies and LocalStorage Handling** - - *"How do I set cookies and localStorage items before starting my crawl?"* - - *"Can I specify multiple origins and different sets of localStorage keys per origin?"* - -3. **Using a `storage_state` File** - - *"How do I load session data from a JSON file?"* - - *"Can I export the current session state to a file and reuse it later?"* - -4. **Login and Authentication Scenarios** - - *"How can I use `storage_state` to skip the login process on subsequent runs?"* - - *"What’s the workflow for logging in once, exporting the session data, and then starting future crawls already logged in?"* - -5. **Updating or Changing the Session State** - - *"What if my session expires? Can I refresh the session and update the `storage_state` file?"* - - *"How can I revert to a 'logged out' state by clearing tokens or using a sign-out scenario?"* - -6. **Practical Use Cases** - - *"If I’m crawling a series of protected pages from the same site, how can `storage_state` speed up the process?"* - - *"Can I switch between multiple `storage_state` files for different accounts or different states (e.g., logged in vs. logged out)?"* - -7. **Performance and Reliability** - - *"Will using `storage_state` improve my crawl performance by reducing repeated actions?"* - - *"Are there any risks or complications when transferring `storage_state` between different environments?"* - -8. 
**Integration with Hooks and Configurations** - - *"How do I integrate `storage_state` with hooks for a one-time login flow?"* - - *"Can I still customize browser or page behavior with hooks if I start with a `storage_state`?"* - -### Topics Discussed in the File - -- **`storage_state` Overview**: - Explaining that `storage_state` is a mechanism to start crawls with preloaded cookies and localStorage data, eliminating the need to re-authenticate or re-set session data every time. - -- **Data Formats**: - You can provide `storage_state` as either a Python dictionary or a JSON file. The JSON structure includes cookies and localStorage entries associated with specific domains/origins. - -- **Practical Authentication Workflows**: - Demonstrating how to log in once (using a hook or manual interaction), then save the resulting `storage_state` to a file. Subsequent crawls can use this file to start already authenticated, greatly speeding up the process and simplifying pipelines. - -- **Updating or Changing State**: - The crawler can export the current session state to a file at any time. This allows reusing the same authenticated session, switching states, or returning to a baseline state (e.g., logged out) by applying a different `storage_state` file. - -- **Integration with Other Features**: - `storage_state` works seamlessly with `AsyncWebCrawler` and `CrawlerRunConfig`. You can still use hooks, JS code execution, and other Crawl4AI features alongside a preloaded session state. - -In summary, the file explains how to use `storage_state` to maintain and reuse session data (cookies, localStorage) across crawls in Crawl4AI, demonstrating how it streamlines workflows that require authentication or complex session setups. 
\ No newline at end of file diff --git a/docs/llm.txt/17_crawl_config.q.md b/docs/llm.txt/17_crawl_config.q.md new file mode 100644 index 00000000..96626848 --- /dev/null +++ b/docs/llm.txt/17_crawl_config.q.md @@ -0,0 +1,17 @@ +content_processing: Configure word count threshold for processing crawled content | minimum words, content length, processing threshold | word_count_threshold=200 +extraction_config: Set strategy for extracting structured data from pages | data extraction, content parsing, structured data | extraction_strategy=ExtractionStrategy() +chunking_setup: Configure content chunking strategy for processing | content splitting, text chunks, segmentation | chunking_strategy=RegexChunking() +content_filtering: Filter irrelevant content using RelevantContentFilter | content pruning, filtering, relevance | content_filter=RelevantContentFilter() +text_extraction: Extract only text content from web pages | text-only, content extraction, plain text | only_text=True +css_selection: Target specific page elements using CSS selectors | element selection, content targeting, DOM selection | css_selector=".main-content" +html_cleaning: Configure HTML tag exclusion and attribute handling | tag removal, attribute filtering, HTML cleanup | excluded_tags=["script", "style"], keep_data_attributes=True +caching_config: Control page caching behavior and session persistence | cache settings, session management, cache control | cache_mode=CacheMode.ENABLED, session_id="session1" +page_navigation: Configure page loading and navigation timing | page timeout, loading conditions, navigation settings | wait_until="domcontentloaded", page_timeout=60000 +request_timing: Set delays between multiple page requests | request delays, crawl timing, rate limiting | mean_delay=0.1, max_range=0.3 +concurrent_ops: Control number of concurrent crawling operations | concurrency, parallel requests, semaphore | semaphore_count=5 +page_interaction: Configure JavaScript execution and page 
scanning | JS execution, page scanning, user simulation | js_code="window.scrollTo(0,1000)", scan_full_page=True +popup_handling: Manage overlay elements and popup removal | overlay removal, popup handling, anti-popup | remove_overlay_elements=True, magic=True +media_capture: Configure screenshot and PDF generation settings | screenshots, PDF export, media capture | screenshot=True, pdf=True +image_processing: Set thresholds for image processing and description | image handling, description extraction, image scoring | image_score_threshold=3, image_description_min_word_threshold=50 +link_filtering: Configure domain and link exclusion rules | domain filtering, link exclusion, URL filtering | exclude_external_links=True, exclude_domains=["example.com"] +debug_settings: Control logging and debugging output | logging, debugging, console output | verbose=True, log_console=True \ No newline at end of file diff --git a/docs/llm.txt/1_introduction.ex.md b/docs/llm.txt/1_introduction.md similarity index 99% rename from docs/llm.txt/1_introduction.ex.md rename to docs/llm.txt/1_introduction.md index dc6bb522..b2231c71 100644 --- a/docs/llm.txt/1_introduction.ex.md +++ b/docs/llm.txt/1_introduction.md @@ -330,9 +330,9 @@ if result.screenshot: with open("page.png", "wb") as f: f.write(result.screenshot) -if result.pdf_data: +if result.pdf: with open("page.pdf", "wb") as f: - f.write(result.pdf_data) + f.write(result.pdf) ``` **File Downloads:** diff --git a/docs/llm.txt/1_introduction.q.md b/docs/llm.txt/1_introduction.q.md new file mode 100644 index 00000000..9b147225 --- /dev/null +++ b/docs/llm.txt/1_introduction.q.md @@ -0,0 +1,16 @@ +installation: Install Crawl4AI using pip and setup required dependencies | package installation, setup guide | pip install crawl4ai && crawl4ai-setup && playwright install chromium +basic_usage: Create AsyncWebCrawler instance to extract web content into markdown | quick start, basic crawling | async with AsyncWebCrawler(verbose=True) as 
crawler: result = await crawler.arun("https://example.com") +browser_configuration: Configure browser settings like headless mode, viewport, and JavaScript | browser setup, chrome options | BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080) +crawler_config: Set crawling parameters including selectors, timeouts and content filters | crawl settings, extraction config | CrawlerRunConfig(css_selector="article.main", page_timeout=60000) +markdown_extraction: Get different markdown formats including raw, cited and filtered versions | content extraction, markdown output | result.markdown_v2.raw_markdown, result.markdown_v2.markdown_with_citations +structured_extraction: Extract structured data using CSS or XPath selectors into JSON | data extraction, scraping | JsonCssExtractionStrategy(schema), JsonXPathExtractionStrategy(xpath_schema) +llm_extraction: Use LLMs to extract structured data with custom schemas | AI extraction, model integration | LLMExtractionStrategy(provider="ollama/nemotron", schema=ModelSchema) +dynamic_content: Handle JavaScript-driven content using custom JS code and wait conditions | dynamic pages, JS execution | run_config.js_code="window.scrollTo(0, document.body.scrollHeight);" +media_handling: Access extracted images, videos and audio with relevance scores | media extraction, asset handling | result.media["images"], result.media["videos"] +link_extraction: Get categorized internal and external links with context | link scraping, URL extraction | result.links["internal"], result.links["external"] +authentication: Preserve login state using user data directory or storage state | login, session handling | BrowserConfig(user_data_dir="/path/to/profile") +proxy_setup: Configure proxy settings with authentication for crawling | proxy configuration, network setup | browser_config.proxy_config={"server": "http://proxy.example.com:8080"} +content_capture: Save screenshots and PDFs of crawled pages | page capture, downloads |
run_config.screenshot=True, run_config.pdf=True +caching: Enable result caching to improve performance | performance optimization, caching | run_config.cache_mode = CacheMode.ENABLED +custom_hooks: Add custom logic at different stages of the crawling process | event hooks, customization | crawler.crawler_strategy.set_hook("on_page_context_created", hook_function) +containerization: Run Crawl4AI in Docker with different architectures and GPU support | docker, deployment | docker pull unclecode/crawl4ai:basic-amd64 \ No newline at end of file diff --git a/docs/llm.txt/1_introduction.xs.q.md b/docs/llm.txt/1_introduction.xs.q.md new file mode 100644 index 00000000..55b8de4e --- /dev/null +++ b/docs/llm.txt/1_introduction.xs.q.md @@ -0,0 +1,13 @@ +installation: Install Crawl4AI using pip and run setup command | package installation, setup | pip install crawl4ai && crawl4ai-setup +playwright_setup: Install Chromium browser for Playwright if needed | browser installation, chromium setup | playwright install chromium +async_crawler: Create asynchronous web crawler instance with optional verbose logging | crawler initialization, async setup | AsyncWebCrawler(verbose=True) +basic_crawl: Perform basic asynchronous webpage crawl and get markdown output | single page crawl, basic usage | async with AsyncWebCrawler() as c: await c.arun(url="https://example.com") +concurrent_crawling: Crawl multiple URLs simultaneously using asyncio.gather | parallel crawling, multiple urls | asyncio.gather(*[c.arun(url=u) for u in urls]) +cache_configuration: Enable or disable cache mode for crawling | caching, cache settings | cache_mode=CacheMode.ENABLED +proxy_setup: Configure proxy settings for web crawler | proxy configuration, http proxy | proxies={"http": "http://user:pass@proxy:port"} +browser_config: Set custom headers and viewport dimensions | user agent, viewport size | headers={"User-Agent": "MyUA"}, viewport={"width":1024,"height":768} +javascript_injection: Inject custom 
JavaScript code during crawling | js injection, custom scripts | js_code=["""(async () => {...})();"""] +json_extraction: Extract data using JSON CSS extraction strategy | css extraction, json schema | JsonCssExtractionStrategy(schema) +llm_extraction: Configure LLM-based extraction with OpenAI integration | language model extraction, AI extraction | LLMExtractionStrategy(provider="openai/gpt-4o", api_token="KEY") +troubleshooting: Common issues include Playwright errors, empty output, and SSL problems | error handling, debugging | playwright install chromium, verify_ssl=False +documentation_links: Access additional resources through GitHub repository and official documentation | resources, links | github.com/unclecode/crawl4ai, crawl4ai.com/mkdocs/ \ No newline at end of file diff --git a/docs/llm.txt/23_common_issues.md b/docs/llm.txt/23_common_issues.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/llm.txt/2_configuration.q.md b/docs/llm.txt/2_configuration.q.md index 03793d53..83649bea 100644 --- a/docs/llm.txt/2_configuration.q.md +++ b/docs/llm.txt/2_configuration.q.md @@ -1,97 +1,20 @@ -### Hypothetical Questions - -**BrowserConfig:** - -1. **Browser Types and Headless Mode** - - *"How do I choose between `chromium`, `firefox`, or `webkit` for `browser_type`?"* - - *"What are the benefits of running the browser in `headless=True` mode versus a visible UI?"* - -2. **Managed Browser and Persistent Context** - - *"When should I enable `use_managed_browser` for advanced session control?"* - - *"How do I use `use_persistent_context` and `user_data_dir` to maintain login sessions and persistent storage?"* - -3. **Debugging and Remote Access** - - *"How do I use the `debugging_port` to remotely inspect the browser with DevTools?"* - -4. **Proxy and Network Configurations** - - *"How can I configure a `proxy` or `proxy_config` for region-specific crawling or authentication?"* - -5. 
**Viewports and Layout Testing** - - *"How do I adjust `viewport_width` and `viewport_height` for responsive layout testing?"* - -6. **Downloads and Storage States** - - *"What steps do I need to take to enable `accept_downloads` and specify a `downloads_path`?"* - - *"How can I use `storage_state` to preload cookies or session data?"* - -7. **HTTPS and JavaScript Settings** - - *"What happens if I set `ignore_https_errors=True` on sites with invalid SSL certificates?"* - - *"When should I disable `java_script_enabled` to improve speed and stability?"* - -8. **Cookies, Headers, and User Agents** - - *"How do I add custom `cookies` or `headers` to every browser request?"* - - *"How can I set a custom `user_agent` or use a `user_agent_mode` like `random` to avoid detection?"* - -9. **Performance Tuning** - - *"What is the difference between `text_mode`, `light_mode`, and adding `extra_args` for performance tuning?"* - ---- - -**CrawlerRunConfig:** - -10. **Content Extraction and Filtering** - - *"How does the `word_count_threshold` affect which pages or sections get processed?"* - - *"What `extraction_strategy` should I use for structured data extraction and how does `chunking_strategy` help organize the content?"* - - *"How do I apply a `css_selector` or `excluded_tags` to refine my extracted content?"* - -11. **Markdown and Text-Only Modes** - - *"Can I generate Markdown output directly and what `markdown_generator` should I use?"* - - *"When should I set `only_text=True` to strip out non-textual content?"* - -12. **Caching and Session Handling** - - *"How does `cache_mode=ENABLED` improve performance, and when should I consider `WRITE_ONLY` or disabling the cache?"* - - *"What is the role of `session_id` in maintaining state across requests?"* - -13. 
**Page Loading and Timing** - - *"How do `wait_until`, `page_timeout`, and `wait_for` elements help control page load timing before extraction?"* - - *"When should I disable `wait_for_images` to speed up the crawl?"* - -14. **Delays and Concurrency** - - *"How do `mean_delay` and `max_range` randomize request intervals to avoid detection?"* - - *"What is `semaphore_count` and how does it manage concurrency for multiple crawling tasks?"* - -15. **JavaScript Execution and Dynamic Content** - - *"How can I inject custom `js_code` to load additional data or simulate user interactions?"* - - *"When should I use `scan_full_page` or `adjust_viewport_to_content` to handle infinite scrolling?"* - -16. **Screenshots, PDFs, and Media** - - *"How do I enable `screenshot` or `pdf` generation to capture page states?"* - - *"What are `image_description_min_word_threshold` and `image_score_threshold` for, and how do they enhance image-related extraction?"* - -17. **Logging and Debugging** - - *"How do `verbose` and `log_console` help me troubleshoot issues with crawling or page scripts?"* - ---- - -### Topics Discussed in the File - -- **BrowserConfig Essentials:** - - Browser types (`chromium`, `firefox`, `webkit`) - - Headless vs. 
non-headless mode - - Persistent context and managed browser sessions - - Proxy configurations and network settings - - Viewport dimensions and responsive testing - - Download handling and storage states - - HTTPS errors and JavaScript enablement - - Cookies, headers, and user agents - - Performance tuning via `text_mode`, `light_mode`, and `extra_args` - -- **CrawlerRunConfig Core Settings:** - - Content extraction parameters (`word_count_threshold`, `extraction_strategy`, `chunking_strategy`) - - Markdown generation and text-only extraction - - Content filtering (`css_selector`, `excluded_tags`) - - Caching strategies and `cache_mode` options - - Page load conditions (`wait_until`, `wait_for`) and timeouts (`page_timeout`) - - Delays, concurrency, and scaling (`mean_delay`, `max_range`, `semaphore_count`) - - JavaScript injections (`js_code`) and handling dynamic/infinite scroll content - - Screenshots, PDFs, and image thresholds for enhanced outputs - - Logging and debugging modes (`verbose`, `log_console`) \ No newline at end of file +browser_config: Configure browser type with chromium, firefox, or webkit support | browser selection, browser engine, web engine | BrowserConfig(browser_type="chromium") +headless_mode: Toggle headless browser mode for GUI-less operation | headless browser, no GUI, background mode | BrowserConfig(headless=True) +managed_browser: Enable advanced browser manipulation and control | browser management, session control | BrowserConfig(use_managed_browser=True) +debugging_setup: Configure remote debugging port for browser inspection | debug port, devtools connection | BrowserConfig(debugging_port=9222) +persistent_context: Enable persistent browser sessions for maintaining state | session persistence, profile saving | BrowserConfig(use_persistent_context=True) +browser_profile: Specify directory for storing browser profile data | user data, profile storage | BrowserConfig(user_data_dir="/path/to/profile") +proxy_configuration: Set up 
proxy settings for browser connections | proxy server, network routing | BrowserConfig(proxy="http://proxy.example.com:8080") +viewport_settings: Configure browser window dimensions | screen size, window dimensions | BrowserConfig(viewport_width=1920, viewport_height=1080) +download_handling: Configure browser download behavior and location | file downloads, download directory | BrowserConfig(accept_downloads=True, downloads_path="/downloads") +content_threshold: Set minimum word count for processing page content | word limit, content filter | CrawlerRunConfig(word_count_threshold=200) +extraction_strategy: Configure method for extracting structured data | data extraction, parsing strategy | CrawlerRunConfig(extraction_strategy=CustomStrategy()) +content_chunking: Define strategy for breaking content into chunks | text chunking, content splitting | CrawlerRunConfig(chunking_strategy=RegexChunking()) +cache_behavior: Control caching mode for crawler operations | cache control, data caching | CrawlerRunConfig(cache_mode=CacheMode.ENABLED) +page_navigation: Configure page load and navigation timing | page timeout, navigation wait | CrawlerRunConfig(wait_until="domcontentloaded", page_timeout=60000) +javascript_execution: Enable or disable JavaScript processing | JS handling, script execution | BrowserConfig(java_script_enabled=True) +content_filtering: Configure HTML tag exclusion and content cleanup | tag filtering, content cleanup | CrawlerRunConfig(excluded_tags=["script", "style"]) +concurrent_operations: Set limit for simultaneous crawler operations | concurrency control, parallel crawling | CrawlerRunConfig(semaphore_count=5) +page_interaction: Configure JavaScript execution and page scanning | page automation, interaction control | CrawlerRunConfig(js_code="custom_script()", scan_full_page=True) +media_capture: Enable screenshot and PDF generation capabilities | visual capture, page export | CrawlerRunConfig(screenshot=True, pdf=True) +debugging_options:
Configure logging and console message capture | debug logging, error tracking | CrawlerRunConfig(verbose=True, log_console=True) \ No newline at end of file diff --git a/docs/llm.txt/3_async_webcrawler.ex.md b/docs/llm.txt/3_async_webcrawler.md similarity index 100% rename from docs/llm.txt/3_async_webcrawler.ex.md rename to docs/llm.txt/3_async_webcrawler.md diff --git a/docs/llm.txt/3_async_webcrawler.q.md b/docs/llm.txt/3_async_webcrawler.q.md index 074d9e9f..e3993566 100644 --- a/docs/llm.txt/3_async_webcrawler.q.md +++ b/docs/llm.txt/3_async_webcrawler.q.md @@ -1,81 +1,15 @@ -### Questions - -1. **Asynchronous Crawling Basics** - - *"How do I perform asynchronous web crawling using `AsyncWebCrawler`?"* - - *"What are the performance benefits of asynchronous I/O in `crawl4ai`?"* - -2. **Browser Configuration** - - *"How can I configure `BrowserConfig` for headless Chromium or Firefox?"* - - *"How do I set viewport dimensions and proxies in the `BrowserConfig`?"* - - *"How can I enable verbose logging for browser interactions?"* - -3. **Docker and Containerization** - - *"How do I run `AsyncWebCrawler` inside a Docker container for scalability?"* - - *"Which dependencies are needed in the Dockerfile to run asynchronous crawls?"* - -4. **Crawling Strategies** - - *"What is `AsyncPlaywrightCrawlerStrategy` and when should I use it?"* - - *"How do I switch between different crawler strategies if multiple are available?"* - -5. **Handling Dynamic Content** - - *"How can I inject custom JavaScript to load more content or simulate user actions?"* - - *"What is the best way to wait for specific DOM elements before extracting content?"* - -6. **Extraction Strategies** - - *"How do I use `JsonCssExtractionStrategy` to extract structured JSON data?"* - - *"What are the differences between regex-based chunking and NLP-based chunking?"* - - *"How can I integrate `LLMExtractionStrategy` for more intelligent data extraction?"* - -7. 
**Caching and Performance** - - *"How does caching improve the performance of asynchronous crawling?"* - - *"How do I clear or bypass the cache in `AsyncWebCrawler`?"* - - *"What are the available `CacheMode` options and when should I use each?"* - -8. **Batch Crawling and Concurrency** - - *"How do I crawl multiple URLs concurrently using `arun_many`?"* - - *"How can I limit concurrency with `semaphore_count` for resource management?"* - -9. **Scaling Crawls** - - *"What strategies can I use to scale asynchronous crawls across multiple machines?"* - - *"How do I integrate job queues or distribute tasks for larger crawl projects?"* - -10. **Screenshots and PDFs** - - *"How do I enable screenshot or PDF capture during a crawl?"* - - *"How can I save visual outputs for troubleshooting rendering issues?"* - -11. **Troubleshooting** - - *"What should I do if the browser fails to launch or times out?"* - - *"How do I debug JavaScript code injections that don’t work as expected?"* - - *"How can I handle partial loads or missing content due to timeouts?"* - -12. **Best Practices** - - *"How do I handle authentication or session management in `AsyncWebCrawler`?"* - - *"How can I avoid getting blocked by target sites, e.g., by using proxies?"* - - *"What error handling approaches are recommended for production crawls?"* - - *"How can I adhere to legal and ethical guidelines when crawling?"* - -13. **Configuration Options** - - *"How do I customize `CrawlerRunConfig` parameters like `mean_delay` and `max_range`?"* - - *"How can I run the crawler non-headless for debugging dynamic interactions?"* - -14. 
**Integration and Reference** - - *"Where can I find the GitHub repository or additional documentation?"* - - *"How do I incorporate Playwright’s advanced features with `AsyncWebCrawler`?"* - -### Topics Discussed in the File - -- **Asynchronous Crawling and Performance** -- **`AsyncWebCrawler` Initialization and Usage** -- **`BrowserConfig` for Browser Choice, Headless Mode, Viewport, Proxy, and Verbosity** -- **Running Crawlers in Docker and Containerized Environments** -- **`AsyncPlaywrightCrawlerStrategy` and DOM Interactions** -- **Dynamic Content Handling via JavaScript Injection** -- **Extraction Strategies (e.g., `JsonCssExtractionStrategy`, `LLMExtractionStrategy`)** -- **Content Chunking Approaches (Regex and NLP-based)** -- **Caching Mechanisms and Cache Modes** -- **Parallel Crawling with `arun_many` and Concurrency Controls** -- **Scaling Crawls Across Multiple Workers or Containers** -- **Screenshot and PDF Generation for Debugging** -- **Common Troubleshooting Techniques and Error Handling** -- **Authentication, Session Management, and Ethical Guidelines** -- **Adjusting `CrawlerRunConfig` for Delays, Concurrency, Extraction, and JavaScript Injection** \ No newline at end of file +quick_start: Basic async crawl setup requires BrowserConfig and AsyncWebCrawler initialization | getting started, basic usage, initialization | asyncio.run(AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True))) +browser_types: AsyncWebCrawler supports multiple browser types including Chromium and Firefox | supported browsers, browser options | BrowserConfig(browser_type="chromium") +headless_mode: Browser can run in headless mode without UI for better performance | invisible browser, no GUI | BrowserConfig(headless=True) +viewport_settings: Configure browser viewport dimensions for proper page rendering | screen size, window size | BrowserConfig(viewport_width=1920, viewport_height=1080) +docker_deployment: AsyncWebCrawler can run in Docker 
containers for scalability | containerization, deployment | FROM python:3.10-slim; RUN pip install crawl4ai playwright +dynamic_content: Handle JavaScript-loaded content using custom JS injection | javascript handling, dynamic loading | CrawlerRunConfig(js_code=["document.querySelector('.load-more').click()"]) +extraction_strategies: Multiple strategies available for content extraction including JsonCssExtractionStrategy and LLMExtractionStrategy | content extraction, data parsing | JsonCssExtractionStrategy(selectors={"title": "h1"}) +caching_modes: Control cache behavior with different modes: ENABLED, BYPASS, DISABLED | cache control, caching options | CrawlerRunConfig(cache_mode=CacheMode.ENABLED) +batch_crawling: Process multiple URLs concurrently using arun_many method | parallel crawling, multiple urls | crawler.arun_many(urls, config=CrawlerRunConfig(semaphore_count=10)) +rate_limiting: Control crawl rate using mean_delay and max_range parameters | throttling, delay control | CrawlerRunConfig(mean_delay=1.0, max_range=0.5) +visual_capture: Generate screenshots and PDFs of crawled pages | page capture, visual output | CrawlerRunConfig(screenshot=True, pdf=True) +error_handling: Common issues include browser launch failures, timeouts, and JS execution problems | troubleshooting, debugging | try/except blocks with crawler.logger +authentication: Handle login requirements through js_code or Playwright selectors | login handling, sessions | CrawlerRunConfig with login steps via js_code +proxy_configuration: Configure proxy settings to bypass IP restrictions | proxy setup, IP rotation | BrowserConfig(proxy="http://proxy-server:port") +chunking_strategies: Split content using regex or NLP-based chunking | content splitting, text processing | CrawlerRunConfig(chunking_strategy=RegexChunking()) \ No newline at end of file diff --git a/docs/llm.txt/3_async_webcrawler.xs.q.md b/docs/llm.txt/3_async_webcrawler.xs.q.md new file mode 100644 index 00000000..c037271b --- 
/dev/null +++ b/docs/llm.txt/3_async_webcrawler.xs.q.md @@ -0,0 +1,12 @@ +setup_usage: Initialize AsyncWebCrawler with BrowserConfig for basic web crawling | crawler setup, initialization, basic usage | AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) +browser_configuration: Configure browser settings including type, headless mode, viewport, and proxy | browser setup, browser settings, viewport config | BrowserConfig(browser_type="firefox", headless=False, viewport_width=1920) +docker_setup: Run crawler in Docker using python slim image with playwright installation | docker configuration, containerization | FROM python:3.10-slim; RUN pip install crawl4ai playwright +crawler_strategy: Use AsyncPlaywrightCrawlerStrategy as default crawler implementation | crawler implementation, strategy pattern | AsyncWebCrawler(crawler_strategy=AsyncPlaywrightCrawlerStrategy()) +dynamic_content: Execute custom JavaScript code for dynamic content loading | javascript execution, dynamic loading, interaction | CrawlerRunConfig(js_code=["document.querySelector('.load-more').click()"]) +extraction_strategies: Choose between JSON CSS, LLM, or No extraction strategies for content parsing | content extraction, parsing strategies | CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(selectors={"title": "h1"})) +cache_management: Control cache behavior with ENABLED, BYPASS, or DISABLED modes | caching, cache control, performance | await c.aclear_cache(), await c.aflush_cache() +parallel_crawling: Crawl multiple URLs concurrently with semaphore control | batch crawling, parallel execution | CrawlerRunConfig(semaphore_count=10) +media_capture: Capture screenshots and PDFs of crawled pages | screenshots, pdf generation, media export | CrawlerRunConfig(screenshot=True, pdf=True) +troubleshooting: Common issues include browser launch failures, timeouts, and stale cache | error handling, debugging, fixes | playwright install chromium +best_practices: 
Use modular crawl logic, proxies, and proper resource cleanup | optimization, maintenance, efficiency | async with AsyncWebCrawler() as c +custom_settings: Configure user agent and local file access options | customization, configuration options | user_agent="MyUserAgent", file:// prefix \ No newline at end of file diff --git a/docs/llm.txt/4_browser_context_page.ex.md b/docs/llm.txt/4_browser_context_page.md similarity index 100% rename from docs/llm.txt/4_browser_context_page.ex.md rename to docs/llm.txt/4_browser_context_page.md diff --git a/docs/llm.txt/4_browser_context_page.q.md b/docs/llm.txt/4_browser_context_page.q.md index 5b4faa94..dd446986 100644 --- a/docs/llm.txt/4_browser_context_page.q.md +++ b/docs/llm.txt/4_browser_context_page.q.md @@ -1,62 +1,10 @@ -### Questions - -1. **Browser Creation and Configuration** - - *"How do I create a browser instance with `BrowserConfig` for asynchronous crawling?"* - - *"What is the difference between standard browser creation and using persistent contexts?"* - - *"How do I configure headless mode and viewport dimensions?"* - -2. **Persistent Sessions and `user_data_dir`** - - *"How do persistent contexts work with `user_data_dir` to maintain session data?"* - - *"How can I reuse cookies and local storage to avoid repetitive logins?"* - -3. **Managed Browser** - - *"What benefits does `ManagedBrowser` provide over a standard browser instance?"* - - *"How do I enable identity preservation and stealth techniques using `ManagedBrowser`?"* - - *"How can I integrate debugging tools like Chrome Developer Tools with `ManagedBrowser`?"* - -4. **Identity Preservation** - - *"How can I simulate human-like behavior (mouse movements, scrolling) to preserve identity?"* - - *"What techniques does `crawl4ai` use to bypass CAPTCHA challenges and maintain authenticity?"* - - *"How do I use real user profiles to solve CAPTCHAs and save session data?"* - -5. 
**Session Management** - - *"How can I maintain state across multiple crawls using `session_id`?"* - - *"What are best practices for using sessions to handle multi-step login flows?"* - - *"How do I reuse sessions for authenticated workflows and reduce overhead?"* - -6. **Dynamic Content Handling** - - *"How can I inject JavaScript or wait conditions to ensure dynamic elements load before extraction?"* - - *"What strategies can I use to navigate infinite scrolling or ‘Load More’ buttons?"* - - *"How do I integrate JS code execution and waiting to handle modern SPA (Single Page Application) layouts?"* - -7. **Scaling and Performance** - - *"How do I scale crawls to handle thousands of URLs concurrently?"* - - *"What options exist for caching and resource utilization optimization?"* - - *"How do I handle multiple browser instances efficiently for high-volume crawling?"* - -8. **Extraction Strategies** - - *"How can I use `JsonCssExtractionStrategy` to extract structured data?"* - - *"What methods are available to chunk or filter extracted content?"* - -9. **Magic Mode vs. Managed Browsers** - - *"What is Magic Mode and when should I use it over Managed Browsers?"* - - *"Does Magic Mode help with basic sites, and how do I enable it?"* - -10. **Troubleshooting and Best Practices** - - *"How can I debug browser automation issues with logs and headful mode?"* - - *"What best practices should I follow to respect website policies?"* - - *"How do I handle authentication flows, form submissions, and CAPTCHA challenges effectively?"* - -### Topics Discussed in the File - -- **Browser Instance Creation** (Standard vs. 
Persistent Contexts) -- **`BrowserConfig` Customization** (headless mode, viewport, proxies, debugging) -- **Managed Browser for Resource Management and Debugging** -- **Identity Preservation Techniques** (Stealth, Human-like Behavior, Bypass CAPTCHAs) -- **Persistent Sessions and `user_data_dir`** (Session Reuse, Authentication Flows) -- **Crawling Modern Web Apps** (Dynamic Content, JS Injection, Infinite Scrolling) -- **Session Management with `session_id`** (Maintaining State, Multi-Step Flows) -- **Magic Mode** (Automation of User-Like Behavior, Simple Setup) -- **Extraction Strategies** (`JsonCssExtractionStrategy`, Handling Structured Data) -- **Scaling and Performance Optimization** (Multiple URLs, Concurrency, Reusing Sessions) -- **Best Practices and Troubleshooting** (Respecting Policies, Debugging Tools, Handling Errors) \ No newline at end of file +browser_creation: Create standard browser instance with default configurations | browser initialization, basic setup, minimal config | AsyncWebCrawler(browser_config=BrowserConfig(browser_type="chromium", headless=True)) +persistent_context: Use persistent browser contexts to maintain session data and cookies | user_data_dir, session storage, login state | BrowserConfig(user_data_dir="/path/to/user/data") +managed_browser: High-level browser management with resource optimization and debugging | browser process, stealth mode, debugging tools | BrowserConfig(headless=False, debugging_port=9222) +context_config: Configure browser context with custom headers and cookies | headers customization, session reuse | CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"}) +page_creation: Create and customize browser pages with viewport settings | viewport size, iframe handling, lazy loading | CrawlerRunConfig(viewport_width=1920, viewport_height=1080) +identity_preservation: Maintain authentic digital identity using Managed Browsers | user profiles, CAPTCHA bypass, persistent login | 
BrowserConfig(use_managed_browser=True, user_data_dir="/path/to/profile") +magic_mode: Enable automated user-like behavior and detection bypass | automation masking, cookie handling | crawler.arun(url="example.com", magic=True) +session_management: Maintain state across multiple requests using session IDs | session reuse, sequential crawling | CrawlerRunConfig(session_id="my_session") +dynamic_content: Handle JavaScript-rendered content with custom execution hooks | content loading, pagination | js_code="document.querySelector('a.pagination-next').click()" +best_practices: Follow recommended patterns for efficient crawling | resource management, error handling | crawler.crawler_strategy.kill_session(session_id) \ No newline at end of file diff --git a/docs/llm.txt/4_browser_context_page.sm.md b/docs/llm.txt/4_browser_context_page.xs.md similarity index 100% rename from docs/llm.txt/4_browser_context_page.sm.md rename to docs/llm.txt/4_browser_context_page.xs.md diff --git a/docs/llm.txt/5_markdown_generation.ex.md b/docs/llm.txt/5_markdown_generation.md similarity index 96% rename from docs/llm.txt/5_markdown_generation.ex.md rename to docs/llm.txt/5_markdown_generation.md index 86c89f55..235e9f83 100644 --- a/docs/llm.txt/5_markdown_generation.ex.md +++ b/docs/llm.txt/5_markdown_generation.md @@ -96,6 +96,30 @@ async with AsyncWebCrawler() as crawler: In this example, we ignore all hyperlinks, do not escape HTML entities, wrap text at 80 characters wide, skip internal links, mark code regions, and include superscript/subscript formatting. 
+### Using Content Filters + +When you need filtered markdown (fit_markdown), configure the content filter with the markdown generator: + +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai import CrawlerRunConfig + +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(), # Content filter goes here + options={ + "ignore_links": True, + "escape_html": False + } + ) +) +``` + +This setup enables: +- Raw markdown generation (always available) +- Filtered markdown (fit_markdown) through PruningContentFilter + ### Using Content Filters in Markdown Generation - **`content_filter` (object):** diff --git a/docs/llm.txt/5_markdown_generation.q.md b/docs/llm.txt/5_markdown_generation.q.md index cbd3bdb6..92d1ea40 100644 --- a/docs/llm.txt/5_markdown_generation.q.md +++ b/docs/llm.txt/5_markdown_generation.q.md @@ -1,53 +1,15 @@ -### Hypothetical Questions - -1. **Markdown Generation Basics** - - *"How can I convert raw HTML into clean, structured Markdown using Crawl4AI?"* - - *"What are the main benefits of generating Markdown from web content for LLM workflows?"* - - *"How do I quickly start generating Markdown output from a given URL?"* - -2. **Default Markdown Generator Configuration** - - *"What parameters can I customize in `DefaultMarkdownGenerator` to control the HTML-to-Markdown conversion?"* - - *"How do I ignore links, images, or HTML entities when converting to Markdown?"* - - *"Can I set a custom line-wrapping width and handle code blocks in Markdown output?"* - -3. 
**Content Filtering Strategies** - - *"How can I apply filters like BM25 or pruning before Markdown generation?"* - - *"What is `fit_markdown` and how does it differ from the raw Markdown output?"* - - *"How do I use `BM25ContentFilter` to get content relevant to a specific user query?"* - - *"What does `PruningContentFilter` do, and when should I use it to clean up noisy HTML?"* - -4. **BM25 and Pruning Filters** - - *"How does BM25 ranking improve the relevance of extracted Markdown content?"* - - *"Which parameters should I tweak if BM25 returns too much or too little content?"* - - *"How can I combine `PruningContentFilter` with BM25 to first remove boilerplate and then focus on relevance?"* - -5. **Advanced html2text Configuration** - - *"What advanced `html2text` options are available and how do I set them?"* - - *"How can I preserve specific tags, handle code blocks, or skip internal links?"* - - *"Can I handle superscript and subscript formatting in the Markdown output?"* - -6. **Troubleshooting and Best Practices** - - *"Why am I getting empty Markdown output and how can I fix it?"* - - *"How do I handle malformed HTML or JavaScript-heavy sites?"* - - *"What are the recommended workflows for large-scale or performance-critical Markdown generation?"* - - *"How do I preserve references or add citation-style links in the final Markdown?"* - -7. 
**Use Cases and Integration** - - *"How can I incorporate `fit_markdown` into an LLM fine-tuning or RAG pipeline?"* - - *"Can I run Crawl4AI’s Markdown generation inside a Docker container for consistent environments?"* - - *"How do I cache results or reuse sessions to speed up repeated markdown generation tasks?"* - -### Topics Discussed in the File - -- **Markdown Generation Workflow** using `DefaultMarkdownGenerator` -- **HTML-to-Markdown Conversion Options** (ignore links, images, escape HTML, line-wrapping, code handling) -- **Applying Content Filters** (BM25 and Pruning) before Markdown generation -- **fit_markdown vs. raw_markdown** for filtered, cleaner output -- **BM25ContentFilter** for query-based content relevance -- **PruningContentFilter** for unsupervised noise removal and cleaner pages -- **Combining Filters** (prune first, then BM25) to refine content -- **Advanced `html2text` Configurations** (handle code blocks, superscripts, skip internal links) -- **Troubleshooting Tips** (empty output, malformed HTML, performance considerations) -- **Downstream Uses**: Training LLMs, building RAG pipelines, semantic search indexing -- **Best Practices** (iterative parameter tuning, caching, Docker deployment) -- **Real-World Scenarios** (news summarization, large corpus pre-processing, improved RAG retrieval quality) \ No newline at end of file +markdown_generation: Converts web content into clean, structured Markdown format for AI processing | html to markdown, text conversion, content extraction | DefaultMarkdownGenerator() +markdown_config_options: Configure HTML to Markdown conversion with html2text options like ignore_links, escape_html, body_width | markdown settings, conversion options | html2text_config={"ignore_links": True, "body_width": 80} +content_filtering: Filter and clean web content using BM25 or Pruning strategies | content cleanup, noise removal | content_filter=BM25ContentFilter() +bm25_filtering: Score and filter content based on relevance 
to a user query | relevance filtering, query matching | BM25ContentFilter(user_query="ai", bm25_threshold=1.5) +pruning_filter: Remove boilerplate and noise using unsupervised clustering approach | content pruning, noise removal | PruningContentFilter(threshold=0.7, threshold_type="dynamic") +markdown_result_types: Access different markdown outputs including raw, cited, and filtered versions | markdown formats, output types | result.markdown_v2.{raw_markdown, markdown_with_citations, fit_markdown} +link_citations: Convert webpage links into citation-style references at document end | reference handling, link management | markdown_with_citations output format +content_scoring: Evaluate content blocks based on text density, link density, and tag importance | content metrics, scoring system | PruningContentFilter metrics +combined_filtering: Apply both pruning and BM25 filters for optimal content extraction | filter pipeline, multi-stage filtering | PruningContentFilter() followed by BM25ContentFilter() +markdown_generation_troubleshooting: Debug empty outputs and malformed content issues | error handling, debugging | Check HTML content and filter thresholds +performance_optimization: Cache results and adjust parameters for better processing speed | optimization, caching | Store intermediate results for reuse +rag_pipeline_integration: Use filtered markdown for retrieval-augmented generation systems | RAG, vector storage | Store fit_markdown in vector database +code_block_handling: Preserve and format code snippets in markdown output | code formatting, syntax | handle_code_in_pre=True option +authentication_handling: Process content from authenticated pages using session tokens | auth support, protected content | Provide session tokens before markdown generation +docker_deployment: Run markdown generation in containerized environment | deployment, containers | Include in Dockerfile configuration \ No newline at end of file diff --git 
a/docs/llm.txt/5_markdown_generation.sm.md b/docs/llm.txt/5_markdown_generation.xs.md similarity index 100% rename from docs/llm.txt/5_markdown_generation.sm.md rename to docs/llm.txt/5_markdown_generation.xs.md diff --git a/docs/llm.txt/6_chunking_strategies.q.md b/docs/llm.txt/6_chunking_strategies.q.md index ff3381c1..8a88b830 100644 --- a/docs/llm.txt/6_chunking_strategies.q.md +++ b/docs/llm.txt/6_chunking_strategies.q.md @@ -1,53 +1,10 @@ -### Hypothetical Questions - -1. **General Purpose of Chunking** - - *"Why is chunking text important before applying cosine similarity or building RAG pipelines?"* - - *"How does dividing large texts into smaller chunks improve retrieval accuracy and scalability?"* - -2. **Regex-Based Chunking** - - *"How can I split text into chunks using a custom regular expression?"* - - *"What are typical use cases for Regex-based chunking, and when should I prefer it over other methods?"* - -3. **Sentence-Based Chunking** - - *"How do I break text into individual sentences using an NLP approach like `sent_tokenize`?"* - - *"When should I prefer sentence-based chunking over regex-based or fixed-length chunking?"* - -4. **Topic-Based Segmentation** - - *"What is topic-based segmentation, and how does it produce thematically coherent chunks?"* - - *"How can I integrate TextTiling or other topic segmentation algorithms into my chunking pipeline?"* - -5. **Fixed-Length Word Chunking** - - *"How do I evenly distribute text into fixed-size word chunks?"* - - *"What are the benefits and drawbacks of using a fixed-length chunking strategy?"* - -6. **Sliding Window Chunking** - - *"What is a sliding window approach, and how does overlapping chunks improve context retention?"* - - *"How do I choose appropriate window sizes and step values for my sliding window chunking?"* - -7. 
**Cosine Similarity Integration** - - *"How do I apply cosine similarity to identify the most relevant chunks for a given query?"* - - *"What preprocessing steps are necessary before computing cosine similarity between a query and the generated chunks?"* - -8. **RAG (Retrieval-Augmented Generation) Applications** - - *"How can chunking strategies facilitate integration with Retrieval-Augmented Generation systems?"* - - *"Which chunking method is best suited for maintaining context in RAG-based pipelines?"* - -9. **Practical Considerations & Best Practices** - - *"How do I choose the right chunking strategy for my specific use case (e.g., documents, transcripts, webpages)?"* - - *"What are some best practices for combining chunking, vectorization, and similarity scoring methods?"* - -10. **Advanced Use Cases** - - *"Can I combine multiple chunking strategies, such as applying sentence tokenization followed by a sliding window?"* - - *"How do I handle very large documents or corpora with chunking and similarity extraction at scale?"* - -### Topics Discussed in the File - -- **Purpose of Chunking Strategies**: Facilitating cosine similarity retrieval and RAG system integration. -- **Regex-Based Chunking**: Splitting text based on patterns (e.g., paragraphs, blank lines). -- **Sentence-Based Chunking**: Using NLP techniques to create sentence-level segments for fine-grained analysis. -- **Topic-Based Segmentation**: Grouping text into topical units for thematic coherence. -- **Fixed-Length Word Chunking**: Dividing text into uniform word count segments for consistent structure. -- **Sliding Window Chunking**: Overlapping segments to preserve contextual continuity. -- **Integrating Cosine Similarity**: Pairing chunked text with a query to retrieve the most relevant content. -- **Applications in RAG Systems**: Enhancing retrieval workflows by organizing content into meaningful chunks. 
-- **Comparison of Chunking Methods**: Trade-offs between simplicity, coherence, and context preservation. \ No newline at end of file +chunking_overview: Chunking strategies divide large texts into manageable parts for content processing and extraction | text segmentation, content division, document splitting | None +cosine_similarity_integration: Chunking prepares text segments for semantic similarity analysis using cosine similarity | semantic search, relevance matching | from sklearn.metrics.pairwise import cosine_similarity +rag_integration: Chunks can be integrated into RAG (Retrieval-Augmented Generation) systems for structured workflows | retrieval augmented generation, RAG pipeline | None +regex_chunking: Split text using regular expression patterns for basic segmentation | regex splitting, pattern-based chunking | RegexChunking(patterns=[r'\n\n']) +sentence_chunking: Divide text into individual sentences using NLP tools | sentence tokenization, NLP chunking | from nltk.tokenize import sent_tokenize +topic_chunking: Create topic-coherent chunks using TextTiling algorithm | topic segmentation, TextTiling | from nltk.tokenize import TextTilingTokenizer +fixed_length_chunking: Segment text into chunks with fixed word count | word-based chunking, fixed size segments | FixedLengthWordChunking(chunk_size=100) +sliding_window_chunking: Generate overlapping chunks for better context preservation | overlapping segments, windowed chunking | SlidingWindowChunking(window_size=100, step=50) +cosine_similarity_extraction: Extract relevant chunks using TF-IDF and cosine similarity comparison | similarity search, relevance extraction | from sklearn.feature_extraction.text import TfidfVectorizer +chunking_workflow: Combine chunking with cosine similarity for enhanced content retrieval | content extraction, similarity workflow | CosineSimilarityExtractor(query).find_relevant_chunks(chunks) \ No newline at end of file diff --git a/docs/llm.txt/7_extraction_strategies.ex.md 
b/docs/llm.txt/7_extraction_strategies.md similarity index 95% rename from docs/llm.txt/7_extraction_strategies.ex.md rename to docs/llm.txt/7_extraction_strategies.md index 7ef6a125..338787a6 100644 --- a/docs/llm.txt/7_extraction_strategies.ex.md +++ b/docs/llm.txt/7_extraction_strategies.md @@ -3,6 +3,33 @@ ## Extraction Strategies Structured data extraction strategies are designed to convert raw web content into organized, JSON-formatted data. These strategies handle diverse extraction scenarios, including schema-based, language model-driven, and clustering methods. This section covers models using LLMs or without using them to extract data with precision and flexibility. +## Input Formats +All extraction strategies support different input formats to give you more control over how content is processed: + +- **markdown** (default): Uses the raw markdown conversion of the HTML content. Best for general text extraction where HTML structure isn't critical. +- **html**: Uses the raw HTML content. Useful when you need to preserve HTML structure or extract data from specific HTML elements. +- **fit_markdown**: Uses the cleaned and filtered markdown content. Best for extracting relevant content while removing noise. Requires a markdown generator with content filter to be configured. + +To specify an input format: +```python +strategy = LLMExtractionStrategy( + input_format="html", # or "markdown" or "fit_markdown" + provider="openai/gpt-4", + instruction="Extract product information" +) +``` + +Note: When using "fit_markdown", ensure your CrawlerRunConfig includes a markdown generator and content filter: +```python +config = CrawlerRunConfig( + extraction_strategy=strategy, + markdown_generator=DefaultMarkdownGenerator(), + content_filter=PruningContentFilter() +) +``` + +If fit_markdown is requested but not available (no markdown generator or content filter), the system will automatically fall back to raw markdown with a warning. 
+ ### LLM Extraction Strategy The **LLM Extraction Strategy** employs a large language model (LLM) to process content dynamically. It supports: - **Schema-Based Extraction**: Using a defined JSON schema to structure output. diff --git a/docs/llm.txt/7_extraction_strategies.q.md b/docs/llm.txt/7_extraction_strategies.q.md index c4e52f14..a323e423 100644 --- a/docs/llm.txt/7_extraction_strategies.q.md +++ b/docs/llm.txt/7_extraction_strategies.q.md @@ -1,74 +1,12 @@ -### Hypothetical Questions - -1. **LLM Extraction Strategy** - - *"How can I use an LLM to dynamically extract structured data from a webpage?"* - - *"What is the difference between block extraction and schema-based extraction in the LLM strategy?"* - - *"How can I define a JSON schema and incorporate it into the LLM extraction process?"* - - *"What parameters control chunk size and overlap for LLM-based extraction?"* - - *"How do I handle errors, retries, and backoff when calling an LLM API for extraction?"* - -2. **Cosine Strategy** - - *"How does the Cosine Strategy identify and cluster semantically similar content?"* - - *"What parameters (like `sim_threshold` or `word_count_threshold`) affect the relevance of extracted content?"* - - *"When should I use semantic filtering with Cosine Strategy vs. simple keyword filtering?"* - - *"How can I adjust `top_k` to retrieve more or fewer relevant content clusters?"* - - *"In what scenarios is the Cosine Strategy more effective than LLM-based or CSS/XPath extraction?"* - -3. 
**JSON-Based Extraction Strategies (Without LLMs)** - - *"What are the advantages of using JSON-based extraction strategies like `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy` over LLM-based methods?"* - - *"How do CSS and XPath selectors differ, and when is XPath more reliable?"* - - *"How can I handle frequently changing class names or dynamic elements using XPath-based extraction?"* - - *"Can I run these extraction strategies offline without any external API calls?"* - - *"How do I combine JS execution with XPath extraction to handle dynamically loaded content?"* - -4. **Environmental and Efficiency Considerations** - - *"Why should I avoid continuous LLM calls for repetitive extraction tasks?"* - - *"How does using XPath extraction reduce energy consumption and costs?"* - - *"Can I initially use an LLM to generate a schema and then rely solely on efficient, local strategies?"* - -5. **Schema Generation with a One-Time LLM Utility** - - *"How can I use a one-time LLM call to generate a schema, then run extraction repeatedly without further LLM costs?"* - - *"What steps are involved in using a language model just once to bootstrap my extraction schema?"* - - *"How do I incorporate the generated schema into `JsonXPathExtractionStrategy` for fast, robust extraction?"* - -6. 
**Advanced Use Cases and Best Practices** - - *"When should I combine LLM-based extraction with cosine similarity filtering for maximum relevance?"* - - *"What best practices should I follow when choosing thresholds and selectors to ensure stable, scalable extractions?"* - - *"How can I adapt these strategies to different page layouts, content types, or query requirements?"* - - *"Are there recommended troubleshooting steps if extraction fails or yields empty results?"* - -### Topics Discussed in the File - -- **LLM Extraction Strategy**: - - **Modes**: Block-based or schema-based extraction using LLMs - - **Parameters**: API tokens, instructions, schemas, chunk sizes, overlap rates - - **Workflows**: Chunk merging, error handling, parallel execution - - **Advantages**: Dynamic adaptability, schema-based extraction, scaling large content - -- **Cosine Strategy**: - - **Approach**: Semantic filtering and clustering of content - - **Parameters**: `semantic_filter`, `word_count_threshold`, `sim_threshold`, `top_k` - - **Use Cases**: Extracting relevant content from unstructured pages based on semantic similarity - - **Advanced Config**: Custom clustering methods, model choices, performance optimization - -- **JSON-Based Extraction Strategies (Non-LLM)**: - - **Strategies**: `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy` - - **Advantages**: Speed, efficiency, no external dependencies, environmentally friendly - - **XPath vs. 
CSS**: XPath recommended for unstable, dynamic front-ends; more robust and structural - - **Dynamic Content**: Combine JS execution and waiting conditions with XPath extraction - -- **Sustainability and Efficiency Considerations**: - - **Rationale**: Avoiding continuous LLM use to save cost, reduce latency, and decrease carbon footprint - - **Scalability**: Run on any device without expensive hardware or API calls - -- **One-Time LLM-Assisted Schema Generation**: - - **Workflow**: Use LLM once to generate a schema from HTML and queries - - **Afterwards**: Rely solely on JSON-based extraction (CSS/XPath) for fast and stable extractions - - **Benefits**: Time-saving, cost-reducing, sustainable approach without sacrificing complexity - -- **Integration and Best Practices**: - - **Threshold Tuning**: Iterative adjustments for `sim_threshold`, `word_count_threshold` - - **Performance**: Chunking large content for LLM extraction, vectorizing content for cosine similarity - - **Testing and Validation**: Use developer tools or dummy HTML to refine selectors, test JS code for dynamic content loading - -Overall, the file emphasizes choosing the right extraction strategy for the task—ranging from highly dynamic and schema-driven LLM approaches to more stable, efficient, and environmentally friendly direct HTML parsing methods (CSS/XPath). It also suggests a hybrid approach where an LLM can be used initially to generate a schema, then rely on local extraction strategies for ongoing tasks. 
\ No newline at end of file +llm_extraction: LLM Extraction Strategy uses language models to process web content into structured JSON | language model extraction, schema extraction, LLM parsing | LLMExtractionStrategy(provider="openai", api_token="token") +schema_based_extraction: Extract data using predefined JSON schemas to structure LLM output | schema extraction, structured output | schema=OpenAIModelFee.model_json_schema() +chunking_config: Configure content chunking with token threshold and overlap rate | content chunks, token limits | chunk_token_threshold=1000, overlap_rate=0.1 +provider_config: Specify LLM provider and API credentials for extraction | model provider, API setup | provider="openai", api_token="your_token" +cosine_strategy: Use similarity-based clustering to extract relevant content sections | content clustering, semantic similarity | CosineStrategy(semantic_filter="product reviews") +clustering_params: Configure clustering behavior with similarity thresholds and methods | similarity settings, cluster config | sim_threshold=0.3, linkage_method='ward' +content_filtering: Filter extracted content based on word count and relevance | content filters, extraction rules | word_count_threshold=10, top_k=3 +xpath_extraction: Extract data using XPath selectors for stable structural parsing | xpath selectors, HTML parsing | JsonXPathExtractionStrategy(schema) +css_extraction: Extract data using CSS selectors for simple HTML parsing | css selectors, HTML parsing | JsonCssExtractionStrategy(schema) +schema_generation: Generate extraction schemas automatically using one-time LLM assistance | schema creation, automation | generate_schema(html, query) +dynamic_content: Handle dynamic webpage content with JavaScript execution and waiting | async content, js execution | js_code="window.scrollTo(0, document.body.scrollHeight)" +extraction_best_practices: Use XPath for stability, avoid unnecessary LLM calls, test selectors | optimization, reliability | 
baseSelector="//table/tbody/tr" \ No newline at end of file diff --git a/docs/llm.txt/7_extraction_strategies.sm.md b/docs/llm.txt/7_extraction_strategies.xs.md similarity index 77% rename from docs/llm.txt/7_extraction_strategies.sm.md rename to docs/llm.txt/7_extraction_strategies.xs.md index 847f07e8..ff209895 100644 --- a/docs/llm.txt/7_extraction_strategies.sm.md +++ b/docs/llm.txt/7_extraction_strategies.xs.md @@ -4,10 +4,30 @@ Streamlined parameters, usage, and code snippets for quick LLM reference. +## Input Formats + +- **markdown** (default): Raw markdown from HTML +- **html**: Raw HTML content +- **fit_markdown**: Cleaned markdown (needs markdown_generator + content_filter) + +```python +strategy = LLMExtractionStrategy( + input_format="html", # Choose format + provider="openai/gpt-4", + instruction="Extract data" +) + +config = CrawlerRunConfig( + extraction_strategy=strategy, + markdown_generator=DefaultMarkdownGenerator(), # For fit_markdown + content_filter=PruningContentFilter() # For fit_markdown +) +``` + ## LLMExtractionStrategy - Uses LLM to extract structured data from HTML. -- Supports `instruction`, `schema`, `extraction_type`, `chunk_token_threshold`, `overlap_rate`. +- Supports `instruction`, `schema`, `extraction_type`, `chunk_token_threshold`, `overlap_rate`, `input_format`. 
```python from crawl4ai.extraction_strategy import LLMExtractionStrategy strategy = LLMExtractionStrategy( @@ -15,7 +35,8 @@ strategy = LLMExtractionStrategy( api_token="your_api_token", instruction="Extract prices", schema={"fields": [...]}, - extraction_type="schema" + extraction_type="schema", + input_format="html" ) ``` diff --git a/docs/llm.txt/8_content_selection.ex.md b/docs/llm.txt/8_content_selection.md similarity index 100% rename from docs/llm.txt/8_content_selection.ex.md rename to docs/llm.txt/8_content_selection.md diff --git a/docs/llm.txt/8_content_selection.q.md b/docs/llm.txt/8_content_selection.q.md index a15d1240..20594f48 100644 --- a/docs/llm.txt/8_content_selection.q.md +++ b/docs/llm.txt/8_content_selection.q.md @@ -1,75 +1,12 @@ -### Hypothetical Questions - -1. **Basic Content Selection** - - *"How can I use a CSS selector to extract only the main article text from a webpage?"* - - *"What’s a quick way to isolate a specific element or section of a webpage using Crawl4AI?"* - -2. **Advanced CSS Selectors** - - *"How do I find the right CSS selector for a given element in a complex webpage?"* - - *"Can I combine multiple CSS selectors to target different parts of the page simultaneously?"* - -3. **Content Filtering** - - *"What parameters can I use to remove non-essential elements like headers, footers, or ads?"* - - *"How do I filter out short or irrelevant text blocks using `word_count_threshold`?"* - - *"Is it possible to exclude external links, images, or social media widgets to get cleaner data?"* - -4. **Iframe Content Handling** - - *"How do I enable iframe processing to extract content embedded in iframes?"* - - *"What should I do if the iframe content doesn’t load or is blocked?"* - -5. 
**LLM-Based Structured Extraction** - - *"When should I consider using LLM strategies for content extraction?"* - - *"How can I define a JSON schema for the LLM to produce structured, JSON-formatted outputs?"* - - *"What if the LLM returns incomplete or incorrect data—how can I refine the instructions or schema?"* - -6. **Pattern-Based Selection with JSON Strategies** - - *"How can I extract multiple items (e.g., a list of articles or products) from a page using `JsonCssExtractionStrategy`?"* - - *"What’s the best way to handle nested fields or multiple levels of data using a JSON schema?"* - -7. **Combining Multiple Techniques** - - *"How do I use CSS selectors, content filtering, and JSON-based extraction strategies together to get clean, structured data?"* - - *"Can I integrate LLM extraction for summarization alongside CSS-based extraction for raw content?"* - -8. **Troubleshooting and Best Practices** - - *"Why am I getting empty or no results from my selectors, and how can I debug it?"* - - *"What should I do if content loading is dynamic and requires waiting or JS execution?"* - - *"How can I optimize performance and reliability for large-scale or repeated crawls?"* - -9. **Performance and Reliability** - - *"How can I improve crawl speed while maintaining precision in content selection?"* - - *"What’s the benefit of using Dockerized environments for consistent and reproducible results?"* - -10. **Additional Resources and Extensions** - - *"Where can I find the source code for the Async Web Crawler and strategies?"* - - *"What advanced topics, such as caching, proxy integration, or Docker deployments, can I explore next?"* - -### Topics Discussed in the File - -- **CSS Selectors for Content Isolation**: - Identifying elements with CSS selectors, using browser dev tools, and extracting targeted sections of a webpage. 
- -- **Content Filtering Parameters**: - Removing unwanted tags, external links, social media elements, and enforcing minimum word counts to ensure meaningful content. - -- **Handling Iframes**: - Enabling `process_iframes` and dealing with multi-domain or overlay elements to extract embedded content. - -- **Structured Extraction with LLMs**: - Using `LLMExtractionStrategy` with schemas and instructions for complex or irregular data extraction, including JSON-based outputs. - -- **Pattern-Based Extraction Using Schemas (JsonCssExtractionStrategy)**: - Defining a JSON schema to extract lists of items (e.g., articles, products) that follow a consistent pattern, capturing nested fields and attributes. - -- **Combining Techniques**: - Integrating CSS selection, filtering, JSON schema extraction, and LLM-based transformation to get clean, structured, and context-rich results. - -- **Troubleshooting and Best Practices**: - Adjusting selectors, filters, and instructions, lowering thresholds if empty results occur, and refining LLM prompts for better data. - -- **Performance and Reliability**: - Starting with simple strategies, adding complexity as needed, and considering asynchronous crawling, caching, or Docker for large-scale operations. - -- **Additional Resources**: - Links to code repositories, instructions for Docker deployments, caching strategies, and further refinement for advanced use cases. - -In summary, the file provides comprehensive guidance on selecting and filtering content within Crawl4AI, covering everything from simple CSS-based extractions to advanced LLM-driven structured outputs, while also addressing common issues, best practices, and performance optimizations. 
\ No newline at end of file +content_selection: Crawl4AI allows precise selection and filtering of webpage content | web scraping, content extraction, web crawler | CrawlerRunConfig(css_selector=".main-article") +css_selectors: Target specific webpage elements using CSS selectors like .main-article or article h1 | DOM selection, HTML elements, element targeting | CrawlerRunConfig(css_selector="article h1, article .content") +media_extraction: Extract video and audio elements with metadata including source, type, and duration | multimedia content, media files | result.media["videos"], result.media["audios"] +link_analysis: Automatically categorize links into internal, external, social media, navigation, and content links | link classification, URL analysis | result.links["internal"], result.links["external"] +link_filtering: Control which links are included using exclude parameters | link exclusion, domain filtering | CrawlerRunConfig(exclude_external_links=True, exclude_social_media_links=True) +metadata_extraction: Automatically extract page metadata including title, description, keywords, and dates | page information, meta tags | result.metadata['title'], result.metadata['description'] +content_filtering: Remove unwanted elements using word count threshold and excluded tags | content cleanup, element removal | CrawlerRunConfig(word_count_threshold=10, excluded_tags=['form', 'header']) +iframe_handling: Process content within iframes by enabling iframe processing and overlay removal | embedded content, frames | CrawlerRunConfig(process_iframes=True, remove_overlay_elements=True) +llm_extraction: Use LLMs for complex content extraction with structured output | AI extraction, structured data | LLMExtractionStrategy(provider="ollama/nemotron", schema=ArticleContent.schema()) +pattern_extraction: Extract repetitive content patterns using JSON schema mapping | structured extraction, repeated elements | JsonCssExtractionStrategy(schema) +troubleshooting: Common issues 
include empty results, unintended content, and LLM errors | debugging, error handling | config.word_count_threshold, excluded_tags +best_practices: Start with simple selectors before advanced strategies and use caching for efficiency | optimization, performance | AsyncWebCrawler().arun(url=url, config=config) \ No newline at end of file diff --git a/docs/llm.txt/8_content_selection.sm.md b/docs/llm.txt/8_content_selection.xs.md similarity index 100% rename from docs/llm.txt/8_content_selection.sm.md rename to docs/llm.txt/8_content_selection.xs.md diff --git a/docs/llm.txt/9_cache_modes.q.md b/docs/llm.txt/9_cache_modes.q.md index 0cae0803..6759926b 100644 --- a/docs/llm.txt/9_cache_modes.q.md +++ b/docs/llm.txt/9_cache_modes.q.md @@ -1,58 +1,10 @@ -### Hypothetical Questions - -1. **General Understanding of the New Caching System** - - *"Why did Crawl4AI move from boolean cache flags to a `CacheMode` enum?"* - - *"What are the benefits of using a single `CacheMode` enum over multiple booleans?"* - -2. **CacheMode Usage** - - *"What `CacheMode` should I use if I want normal caching (both read and write)?"* - - *"How do I enable a mode that only reads from cache, or only writes to cache?"* - - *"What does `CacheMode.BYPASS` do, and how is it different from `CacheMode.DISABLED`?"* - -3. **Migrating from Old to New System** - - *"How do I translate `bypass_cache=True` to the new `CacheMode` approach?"* - - *"I used to set `disable_cache=True`; what `CacheMode` should I use now?"* - - *"If I previously used `no_cache_read=True`, how do I achieve the same effect with `CacheMode`?"* - -4. **Implementation Details** - - *"How do I specify the `CacheMode` in my crawler runs?"* - - *"Can I pass the `CacheMode` to `arun` directly, or do I need a `CrawlerRunConfig` object?"* - -5. **Suppressing Deprecation Warnings** - - *"How can I temporarily disable deprecation warnings while I migrate my code?"* - -6. 
**Edge Cases and Best Practices** - - *"What if I forget to update my code and still use the old flags?"* - - *"Is there a `CacheMode` for scenarios where I want to only write to cache and never read old data?"* - -7. **Examples and Code Snippets** - - *"Can I see a side-by-side comparison of old and new caching code for a given URL?"* - - *"How can I confirm that using `CacheMode.BYPASS` skips both reading and writing cache?"* - -8. **Performance and Reliability** - - *"Will switching to `CacheMode` improve my code’s readability and reduce confusion?"* - - *"Can the new caching system still handle large-scale crawling scenarios efficiently?"* - -### Topics Discussed in the File - -- **Old vs. New Caching Approach**: - Previously, multiple boolean flags (`bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`) controlled caching. Now, a single `CacheMode` enum simplifies configuration. - -- **CacheMode Enum**: - Provides clear modes: - - `ENABLED`: Normal caching (read and write) - - `DISABLED`: No caching at all - - `READ_ONLY`: Only read from cache, don’t write new data - - `WRITE_ONLY`: Only write to cache, don’t read old data - - `BYPASS`: Skip cache entirely for this operation - -- **Migration Patterns**: - A simple mapping table helps developers switch old boolean flags to the corresponding `CacheMode` value. - -- **Suppressing Deprecation Warnings**: - Temporarily disabling deprecation warnings provides a grace period to update old code. - -- **Code Examples**: - Side-by-side comparisons show how to update code from old flags to the new `CacheMode` approach. - -In summary, the file guides developers in transitioning from the old caching boolean flags to the new `CacheMode` enum, explaining the rationale, providing a mapping table, and offering code snippets to facilitate a smooth migration. 
\ No newline at end of file +cache_system: Crawl4AI v0.5.0 introduces CacheMode enum to replace boolean cache flags | caching system, cache control, cache configuration | CacheMode.ENABLED +cache_modes: CacheMode enum supports five states: ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, and BYPASS | cache states, caching options, cache settings | CacheMode.ENABLED, CacheMode.DISABLED, CacheMode.READ_ONLY, CacheMode.WRITE_ONLY, CacheMode.BYPASS +cache_migration_bypass: Replace bypass_cache=True with cache_mode=CacheMode.BYPASS | skip cache, bypass caching | cache_mode=CacheMode.BYPASS +cache_migration_disable: Replace disable_cache=True with cache_mode=CacheMode.DISABLED | disable caching, turn off cache | cache_mode=CacheMode.DISABLED +cache_migration_read: Replace no_cache_read=True with cache_mode=CacheMode.WRITE_ONLY | write-only cache, disable read | cache_mode=CacheMode.WRITE_ONLY +cache_migration_write: Replace no_cache_write=True with cache_mode=CacheMode.READ_ONLY | read-only cache, disable write | cache_mode=CacheMode.READ_ONLY +crawler_config: Use CrawlerRunConfig to set cache mode in AsyncWebCrawler | crawler settings, configuration object | CrawlerRunConfig(cache_mode=CacheMode.BYPASS) +deprecation_warnings: Suppress cache deprecation warnings by setting SHOW_DEPRECATION_WARNINGS to False | warning suppression, legacy support | SHOW_DEPRECATION_WARNINGS = False +async_crawler_usage: AsyncWebCrawler requires async/await syntax and supports configuration via CrawlerRunConfig | async crawler, web crawler setup | async with AsyncWebCrawler(verbose=True) as crawler +crawler_execution: Run AsyncWebCrawler using asyncio.run() in main script | crawler execution, async main | asyncio.run(main()) \ No newline at end of file diff --git a/docs/llm.txt/llmtxt.py b/docs/llm.txt/llmtxt.py deleted file mode 100644 index 325fa35b..00000000 --- a/docs/llm.txt/llmtxt.py +++ /dev/null @@ -1,187 +0,0 @@ -import os -from pathlib import Path -from rank_bm25 import BM25Okapi -import 
import os
import re
from pathlib import Path
from typing import List, Literal

# Root of the llm.txt docs directory (the folder this module lives in).
BASE_PATH = Path(__file__).resolve().parent


def get_file_map() -> dict:
    """Scan BASE_PATH once and map doc base names to their numeric prefix.

    Example: "6_chunking_strategies.md" -> {"chunking_strategies": "6"}.
    Variant suffixes (.ex/.xs/.sm/.q) collapse onto the same base name;
    the first number seen for a given name wins.
    """
    file_map: dict = {}
    for filename in os.listdir(BASE_PATH):
        if not filename.endswith('.md'):
            continue
        match = re.match(r'(\d+)_(.+?)(?:\.(?:ex|xs|sm|q)?\.md)?$', filename)
        if match:
            num, name = match.groups()
            # setdefault keeps the first mapping, matching the original
            # "if name not in file_map" guard.
            file_map.setdefault(name, num)
    return file_map


def concatenate_docs(file_names: List[str], mode: Literal["extended", "condensed"]) -> str:
    """Concatenate documentation files by base name, preferring mode-specific variants.

    "extended" prefers the ".ex.md" variant; "condensed" prefers ".xs.md"
    then ".sm.md". Both modes fall back to the plain ".md" file. Names not
    present in the file map are silently skipped. Sections are joined with
    a markdown horizontal rule.
    """
    file_map = get_file_map()
    result: List[str] = []
    suffix_map = {
        "extended": ".ex.md",
        "condensed": [".xs.md", ".sm.md"],
    }

    for name in file_names:
        if name not in file_map:
            continue
        num = file_map[name]

        if mode == "extended":
            file_path = BASE_PATH / f"{num}_{name}{suffix_map[mode]}"
            if not file_path.exists():
                file_path = BASE_PATH / f"{num}_{name}.md"
        else:
            file_path = None
            for suffix in suffix_map["condensed"]:
                candidate = BASE_PATH / f"{num}_{name}{suffix}"
                if candidate.exists():
                    file_path = candidate
                    break
            if not file_path:
                file_path = BASE_PATH / f"{num}_{name}.md"

        if file_path.exists():
            with open(file_path, 'r', encoding='utf-8') as f:
                result.append(f.read())

    return "\n\n---\n\n".join(result)


def extract_questions(content: str) -> List[tuple[str, str, str]]:
    """Extract questions from a Q file.

    Returns a list of (category, question, full_section) tuples, where
    full_section is a "Category: ...\\nQuestion: ..." context string.
    """
    # Split on "### ... Questions" headers; text before the first header
    # is discarded by the [1:] slice.
    sections = re.split(r'^###\s+.*Questions\s*$', content, flags=re.MULTILINE)[1:]

    results: List[tuple[str, str, str]] = []
    for section in sections:
        # re.split with one capture group yields an alternating list:
        # [prefix, category, body, category, body, ...] from numbered
        # bold headings like "1. **Category Name**".
        categories = re.split(r'^\d+\.\s+\*\*([^*]+)\*\*\s*$', section.strip(), flags=re.MULTILINE)
        for i in range(1, len(categories), 2):
            category = categories[i].strip()
            category_content = categories[i + 1].strip()
            # Questions are dash bullets wrapped in italic quotes: - *"..."*
            questions = re.findall(r'^\s*-\s*\*"([^"]+)"\*\s*$', category_content, flags=re.MULTILINE)
            results.extend(
                (category, q, f"Category: {category}\nQuestion: {q}")
                for q in questions
            )
    return results


def preprocess_text(text: str) -> List[str]:
    """Tokenize, de-stopword, and lemmatize *text* for BM25 matching.

    NLTK is imported lazily so the module can be imported (and the
    non-search helpers used) without the optional NLP stack installed.
    """
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    tokens = word_tokenize(text.lower())

    # Keep interrogatives out of the stopword set: they carry intent
    # when matching hypothetical questions.
    stop_words = set(stopwords.words('english')) - {'how', 'what', 'when', 'where', 'why', 'which'}
    lemmatizer = WordNetLemmatizer()

    return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]


def search_questions(query: str, top_k: int = 5) -> str:
    """Rank Q-file questions against *query* with BM25 and report the top files.

    Scores above 40% of the best hit are aggregated per source file; files
    are ranked by match count, then total score. Returns a formatted
    plain-text report (or a "no matches" message).

    NOTE(review): duplicate question strings across files overwrite each
    other in the lookup table, as in the original implementation.
    """
    from rank_bm25 import BM25Okapi  # lazy: heavy optional dependency

    q_files = [BASE_PATH / f for f in os.listdir(BASE_PATH) if f.endswith(".q.md")]

    documents: List[str] = []
    file_contents: dict = {}
    for file in q_files:
        with open(file, 'r', encoding='utf-8') as f:
            content = f.read()
        for category, question, full_section in extract_questions(content):
            documents.append(question)
            file_contents[question] = (file, category, full_section)

    if not documents:
        return "No questions found in documentation."

    bm25 = BM25Okapi([preprocess_text(doc) for doc in documents])
    doc_scores = bm25.get_scores(preprocess_text(query))

    # Keep only reasonably strong matches relative to the best hit.
    score_threshold = max(doc_scores) * 0.4

    # Aggregate scores by file.
    file_data: dict = {}
    for idx, score in enumerate(doc_scores):
        if score <= score_threshold:
            continue
        question = documents[idx]
        file, category, _ = file_contents[question]
        entry = file_data.setdefault(file, {
            'total_score': 0,
            'match_count': 0,
            'questions': [],
        })
        entry['total_score'] += score
        entry['match_count'] += 1
        entry['questions'].append({
            'category': category,
            'question': question,
            'score': score,
        })

    # Sort files by match count, then total score.
    ranked_files = sorted(
        file_data.items(),
        key=lambda x: (x[1]['match_count'], x[1]['total_score']),
        reverse=True,
    )[:top_k]

    results = []
    for file, data in ranked_files:
        questions_summary = "\n".join(
            f"- [{q['category']}] {q['question']} (score: {q['score']:.2f})"
            for q in sorted(data['questions'], key=lambda x: x['score'], reverse=True)
        )
        results.append(
            f"File: {file}\n"
            f"Match Count: {data['match_count']}\n"
            f"Total Score: {data['total_score']:.2f}\n\n"
            f"Matching Questions:\n{questions_summary}"
        )

    return "\n\n---\n\n".join(results) if results else "No relevant matches found."
- -if __name__ == "__main__": - # Example 1: Concatenate docs - docs = concatenate_docs(["chunking_strategies", "content_selection"], "extended") - print("Concatenated docs:", docs[:200], "...\n") - - # Example 2: Search questions - results = search_questions("How do I execute JS script on the page?", 3) - print("Search results:", results[:200], "...") \ No newline at end of file diff --git a/docs/md_v2/extraction/overview.md b/docs/md_v2/extraction/overview.md index 53a8b87d..7c524475 100644 --- a/docs/md_v2/extraction/overview.md +++ b/docs/md_v2/extraction/overview.md @@ -169,6 +169,35 @@ llm_result = await crawler.arun( ) ``` + +## Input Formats +All extraction strategies support different input formats to give you more control over how content is processed: + +- **markdown** (default): Uses the raw markdown conversion of the HTML content. Best for general text extraction where HTML structure isn't critical. +- **html**: Uses the raw HTML content. Useful when you need to preserve HTML structure or extract data from specific HTML elements. +- **fit_markdown**: Uses the cleaned and filtered markdown content. Best for extracting relevant content while removing noise. Requires a markdown generator with content filter to be configured. + +To specify an input format: +```python +strategy = LLMExtractionStrategy( + input_format="html", # or "markdown" or "fit_markdown" + provider="openai/gpt-4", + instruction="Extract product information" +) +``` + +Note: When using "fit_markdown", ensure your CrawlerRunConfig includes a markdown generator with content filter: +```python +config = CrawlerRunConfig( + extraction_strategy=strategy, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() # Content filter goes here for fit_markdown + ) +) +``` + +If fit_markdown is requested but not available (no markdown generator or content filter), the system will automatically fall back to raw markdown with a warning. + ## Best Practices 1. 
"""Example script showing how to save SSL certificates."""

import asyncio
import os

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.utilities.cert_exporter import CertificateExporter

# Resolve the repository root (parent of this examples folder) and make
# sure a "tmp" output directory exists next to it.
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
os.makedirs(os.path.join(parent_dir, "tmp"), exist_ok=True)
__tmp_dir__ = os.path.join(parent_dir, "tmp")


async def main():
    """Crawl a page with SSL capture enabled and export its certificate.

    Saves the certificate as JSON and PEM into the tmp directory, then
    prints a short summary of the certificate fields.
    """
    # fetch_ssl_certificate makes the crawler capture the server's
    # certificate; BYPASS ensures a live fetch rather than a cached result.
    crawl_config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url='https://example.com',
            config=crawl_config
        )

        if result.success and result.ssl_certificate:
            # 1. Save as JSON (most readable format)
            CertificateExporter.to_json(
                result.ssl_certificate,
                filepath=os.path.join(__tmp_dir__, "certificate.json")
            )
            print("Certificate saved in JSON format: certificate.json")

            # 2. Save as PEM (standard format for web servers)
            CertificateExporter.to_pem(
                result.ssl_certificate,
                filepath=os.path.join(__tmp_dir__, "certificate.pem")
            )
            print("Certificate saved in PEM format: certificate.pem")

            # Print basic certificate info.
            cert = result.ssl_certificate
            print("\nCertificate Information:")
            # BUG FIX: issuer components use bytes keys/values (pyOpenSSL
            # X509Name components), so the fallback must be b'' — the
            # original str default '' would raise AttributeError on
            # .decode() whenever CN is missing.
            print(f"Issuer: {cert['issuer'].get(b'CN', b'').decode()}")
            print(f"Valid until: {cert['not_after']}")
            print(f"Fingerprint: {cert['fingerprint']}")


if __name__ == "__main__":
    asyncio.run(main())
Export as DER (binary format, used by Java) + der_data = CertificateExporter.to_der( + result.ssl_certificate, + filepath=str(output_dir / "cert.der") + ) + + # 4. Export all formats at once + export_paths = CertificateExporter.export_all( + result.ssl_certificate, + str(output_dir), + "certificate" + ) + + print("Certificate exported in multiple formats:") + for fmt, path in export_paths.items(): + print(f"- {fmt.upper()}: {path}") + + # Print some certificate information + cert = result.ssl_certificate + print("\nCertificate Information:") + print(f"Subject: {cert['subject']}") + print(f"Issuer: {cert['issuer']}") + print(f"Valid from: {cert['not_before']}") + print(f"Valid until: {cert['not_after']}") + print(f"Fingerprint: {cert['fingerprint']}") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/main.py b/main.py index 21d3de16..21e411d0 100644 --- a/main.py +++ b/main.py @@ -351,8 +351,8 @@ async def verify_token(credentials: HTTPAuthorizationCredentials = Security(secu raise HTTPException(status_code=401, detail="Invalid token") return credentials -# Helper function to conditionally apply security def secure_endpoint(): + """Returns security dependency only if CRAWL4AI_API_TOKEN is set""" return Depends(verify_token) if CRAWL4AI_API_TOKEN else None # Check if site directory exists @@ -379,13 +379,12 @@ def read_root(): # Return a json response return {"message": "Crawl4AI API service is running"} - -@app.post("/crawl", dependencies=[Depends(verify_token)]) +@app.post("/crawl", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []) async def crawl(request: CrawlRequest) -> Dict[str, str]: task_id = await crawler_service.submit_task(request) return {"task_id": task_id} -@app.get("/task/{task_id}", dependencies=[Depends(verify_token)]) +@app.get("/task/{task_id}", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []) async def get_task_status(task_id: str): task_info = crawler_service.task_manager.get_task(task_id) if not 
task_info: @@ -407,7 +406,7 @@ async def get_task_status(task_id: str): return response -@app.post("/crawl_sync", dependencies=[Depends(verify_token)]) +@app.post("/crawl_sync", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []) async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: task_id = await crawler_service.submit_task(request) @@ -431,7 +430,7 @@ async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: # If we get here, task didn't complete within timeout raise HTTPException(status_code=408, detail="Task timed out") -@app.post("/crawl_direct", dependencies=[Depends(verify_token)]) +@app.post("/crawl_direct", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []) async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]: try: crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params) diff --git a/requirements.txt b/requirements.txt index 741e12ef..fc616d5b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,5 @@ rank-bm25~=0.2 aiofiles>=24.1.0 colorama~=0.4 snowballstemmer~=2.2 -pydantic>=2.10 \ No newline at end of file +pydantic>=2.10 +pyOpenSSL>=24.3.0 \ No newline at end of file diff --git a/tests/test_cli_docs.py b/tests/test_cli_docs.py new file mode 100644 index 00000000..9d2a7841 --- /dev/null +++ b/tests/test_cli_docs.py @@ -0,0 +1,43 @@ +import asyncio +from pathlib import Path +from crawl4ai.docs_manager import DocsManager +from click.testing import CliRunner +from crawl4ai.cli import cli + +def test_cli(): + """Test all CLI commands""" + runner = CliRunner() + + print("\n1. Testing docs update...") + # Use sync version for testing + docs_manager = DocsManager() + loop = asyncio.get_event_loop() + loop.run_until_complete(docs_manager.fetch_docs()) + + # print("\n2. Testing listing...") + # result = runner.invoke(cli, ['docs', 'list']) + # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}") + # print(result.output) + + # print("\n2. 
"""Manual smoke test for AsyncLLMTextManager index generation and search."""

from crawl4ai.llmtxt import AsyncLLMTextManager  # Changed to AsyncLLMTextManager
from crawl4ai.async_logger import AsyncLogger
from pathlib import Path
import asyncio


async def main():
    """Build llm.txt index files in the local docs dir, then run a search."""
    this_file = Path(__file__).resolve()
    doc_root = this_file.parent.parent / "local/_docs/llm.txt"

    # The docs directory must exist before the manager touches it.
    doc_root.mkdir(parents=True, exist_ok=True)

    # batch_size controls how many docs are processed per LLM call batch.
    text_manager = AsyncLLMTextManager(doc_root, AsyncLogger(), batch_size=2)

    # Show which markdown sources are present before indexing.
    print("\nAvailable files:")
    for md_file in doc_root.glob("*.md"):
        print(f"- {md_file.name}")

    print("\nGenerating index files...")
    await text_manager.generate_index_files(
        force_generate_facts=False,
        clear_bm25_cache=False
    )

    # Exercise the search path with a representative Crawl4AI question.
    print("\nTesting search functionality:")
    for query in ["How is using the `arun_many` method?"]:
        print(f"\nQuery: {query}")
        hits = text_manager.search(query, top_k=2)
        print(f"Results length: {len(hits)} characters")
        if hits:
            print("First 200 chars of results:", hits[:200].replace('\n', ' '), "...")
        else:
            print("No results found")


if __name__ == "__main__":
    asyncio.run(main())