diff --git a/.codeiumignore b/.codeiumignore new file mode 100644 index 00000000..76ff6caa --- /dev/null +++ b/.codeiumignore @@ -0,0 +1,220 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +Crawl4AI.egg-info/ +Crawl4AI.egg-info/* +crawler_data.db +.vscode/ +.tests/ +.test_pads/ +test_pad.py +test_pad*.py +.data/ +Crawl4AI.egg-info/ + +requirements0.txt +a.txt + +*.sh +.idea +docs/examples/.chainlit/ +docs/examples/.chainlit/* +.chainlit/config.toml +.chainlit/translations/en-US.json + +local/ +.files/ + +a.txt +.lambda_function.py +ec2* + +update_changelog.sh + +.DS_Store +docs/.DS_Store +tmp/ +test_env/ +**/.DS_Store +**/.DS_Store + +todo.md +todo_executor.md +git_changes.py +git_changes.md +pypi_build.sh +git_issues.py +git_issues.md + +.next/ +.tests/ +.docs/ +.gitboss/ +todo_executor.md +protect-all-except-feature.sh +manage-collab.sh +publish.sh +combine.sh +combined_output.txt +tree.md + diff --git a/.gitignore b/.gitignore index 5bb88ae2..6a3b65f0 100644 --- a/.gitignore +++ b/.gitignore @@ -208,7 +208,7 @@ git_issues.md .next/ .tests/ -.issues/ +# .issues/ .docs/ .issues/ .gitboss/ @@ -220,4 +220,8 @@ combine.sh combined_output.txt .local .scripts -tree.md \ No newline at end of file +tree.md +tree.md +.scripts +.local +.do diff --git a/CHANGELOG.md b/CHANGELOG.md index 58dacf81..ccd1e098 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,19 +1,86 @@ # Changelog -## [0.4.1] December 8, 2024 +All notable changes to Crawl4AI will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [0.4.24] - 2024-12-31 + +### Added +- **Browser and SSL Handling** + - SSL certificate validation options in extraction strategies + - Custom certificate paths support + - Configurable certificate validation skipping + - Enhanced response status code handling with retry logic + +- **Content Processing** + - New content filtering system with regex support + - Advanced chunking strategies for large content + - Memory-efficient parallel processing + - Configurable chunk size optimization + +- **JSON Extraction** + - Complex JSONPath expression support + - JSON-LD and Microdata extraction + - RDFa parsing capabilities + - Advanced data transformation pipeline + +- **Field Types** + - New field types: `computed`, `conditional`, `aggregate`, `template` + - Field inheritance system + - Reusable field definitions + - Custom validation rules + +### Changed +- **Performance** + - Optimized selector compilation with caching + - Improved HTML parsing efficiency + - Enhanced memory management for large documents + - Batch processing optimizations + +- **Error Handling** + - More detailed error messages and categorization + - Enhanced debugging capabilities + - Improved performance metrics tracking + - Better error recovery mechanisms + +### Deprecated +- Old field computation method using `eval` +- Direct browser manipulation without proper SSL handling +- Simple text-based content filtering + +### Removed +- Legacy extraction patterns without proper error handling +- Unsafe eval-based field computation +- Direct DOM manipulation without sanitization + +### Fixed +- Memory leaks in large document processing +- SSL certificate validation issues +- Incorrect handling of nested JSON structures +- Performance bottlenecks in parallel processing + +### Security +- Improved input validation and sanitization +- Safe expression evaluation system +- Enhanced resource protection +- Rate limiting implementation + +## [0.4.1] - 2024-12-08 ### **File: 
`crawl4ai/async_crawler_strategy.py`** #### **New Parameters and Attributes Added** -- **`text_only` (boolean)**: Enables text-only mode, disables images, JavaScript, and GPU-related features for faster, minimal rendering. +- **`text_mode` (boolean)**: Enables text-only mode, disables images, JavaScript, and GPU-related features for faster, minimal rendering. - **`light_mode` (boolean)**: Optimizes the browser by disabling unnecessary background processes and features for efficiency. -- **`viewport_width` and `viewport_height`**: Dynamically adjusts based on `text_only` mode (default values: 800x600 for `text_only`, 1920x1080 otherwise). -- **`extra_args`**: Adds browser-specific flags for `text_only` mode. +- **`viewport_width` and `viewport_height`**: Dynamically adjusts based on `text_mode` (default values: 800x600 for `text_mode`, 1920x1080 otherwise). +- **`extra_args`**: Adds browser-specific flags for `text_mode`. - **`adjust_viewport_to_content`**: Dynamically adjusts the viewport to the content size for accurate rendering. #### **Browser Context Adjustments** -- Added **`viewport` adjustments**: Dynamically computed based on `text_only` or custom configuration. -- Enhanced support for `light_mode` and `text_only` by adding specific browser arguments to reduce resource consumption. +- Added **`viewport` adjustments**: Dynamically computed based on `text_mode` or custom configuration. +- Enhanced support for `light_mode` and `text_mode` by adding specific browser arguments to reduce resource consumption. #### **Dynamic Content Handling** - **Full Page Scan Feature**: @@ -709,7 +776,7 @@ This commit introduces several key enhancements, including improved error handli - Improved `AsyncPlaywrightCrawlerStrategy.close()` method to use a shorter sleep time (0.5 seconds instead of 500), significantly reducing wait time when closing the crawler. 
- Enhanced flexibility in `CosineStrategy`: - Now uses a more generic `load_HF_embedding_model` function, allowing for easier swapping of embedding models. -- Updated `JsonCssExtractionStrategy` and `JsonXPATHExtractionStrategy` for better JSON-based extraction. +- Updated `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy` for better JSON-based extraction. ### Fixed - Addressed potential issues with the sliding window chunking strategy to ensure all text is properly chunked. @@ -980,6 +1047,6 @@ These changes focus on refining the existing codebase, resulting in a more stabl - Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. - Updated Dockerfile to ensure compatibility across multiple platforms (Hopefully!). -## [0.2.4] - 2024-06-17 +## [v0.2.4] - 2024-06-17 ### Fixed - Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs diff --git a/README.md b/README.md index bf42cf9a..29bae309 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,9 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out latest update v0.4.2](#-recent-updates) +[✨ Check out latest update v0.4.24](#-recent-updates) -🎉 **Version 0.4.2 is out!** Introducing our experimental PruningContentFilter - a powerful new algorithm for smarter Markdown generation. Test it out and [share your feedback](https://github.com/unclecode/crawl4ai/issues)! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) +🎉 **Version 0.4.24 is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! 
[Read the release notes →](https://crawl4ai.com/mkdocs/blog) ## 🧐 Why Crawl4AI? @@ -626,19 +626,15 @@ async def test_news_crawl(): ## ✨ Recent Updates -- 🔧 **Configurable Crawlers and Browsers**: Simplified crawling with `BrowserConfig` and `CrawlerRunConfig`, making setups cleaner and more scalable. -- 🔐 **Session Management Enhancements**: Import/export local storage for personalized crawling with seamless session reuse. -- 📸 **Supercharged Screenshots**: Take lightning-fast, full-page screenshots of very long pages. -- 📜 **Full-Page PDF Export**: Convert any web page into a PDF for easy sharing or archiving. -- 🖼️ **Lazy Load Handling**: Improved support for websites with lazy-loaded images. The crawler now waits for all images to fully load, ensuring no content is missed. -- ⚡ **Text-Only Mode**: New mode for fast, lightweight crawling. Disables images, JavaScript, and GPU rendering, improving speed by 3-4x for text-focused crawls. -- 📐 **Dynamic Viewport Adjustment**: Automatically adjusts the browser viewport to fit page content, ensuring accurate rendering and capturing of all elements. -- 🔄 **Full-Page Scanning**: Added scrolling support for pages with infinite scroll or dynamic content loading. Ensures every part of the page is captured. -- 🧑‍💻 **Session Reuse**: Introduced `create_session` for efficient crawling by reusing the same browser session across multiple requests. -- 🌟 **Light Mode**: Optimized browser performance by disabling unnecessary features like extensions, background timers, and sync processes. 
+- 🔒 **Enhanced SSL & Security**: New SSL certificate handling with custom paths and validation options for secure crawling +- 🔍 **Smart Content Filtering**: Advanced filtering system with regex support and efficient chunking strategies +- 📦 **Improved JSON Extraction**: Support for complex JSONPath, JSON-LD, and Microdata extraction +- 🏗️ **New Field Types**: Added `computed`, `conditional`, `aggregate`, and `template` field types +- ⚡ **Performance Boost**: Optimized caching, parallel processing, and memory management +- 🐛 **Better Error Handling**: Enhanced debugging capabilities with detailed error tracking +- 🔐 **Security Features**: Improved input validation and safe expression evaluation - -Read the full details of this release in our [0.4.2 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/blog/releases/0.4.2.md). +Read the full details of this release in our [0.4.24 Release Notes](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md). ## 📖 Documentation & Roadmap diff --git a/a.md b/a.md deleted file mode 100644 index 4d68148f..00000000 --- a/a.md +++ /dev/null @@ -1,4214 +0,0 @@ -diff --git a/.gitignore b/.gitignore -index 02c75b3..432b5aa 100644 ---- a/.gitignore -+++ b/.gitignore -@@ -206,6 +206,7 @@ pypi_build.sh - git_issues.py - git_issues.md - -+.next/ - .tests/ - .issues/ - .docs/ -diff --git a/README.sync.md b/README.sync.md -deleted file mode 100644 -index 6bbef7e..0000000 ---- a/README.sync.md -+++ /dev/null -@@ -1,244 +0,0 @@ --# Crawl4AI v0.2.77 🕷️🤖 -- --[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) --[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) --[![GitHub Issues](https://img.shields.io/github/issues/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/issues) --[![GitHub Pull 
Requests](https://img.shields.io/github/issues-pr/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/pulls) --[![License](https://img.shields.io/github/license/unclecode/crawl4ai)](https://github.com/unclecode/crawl4ai/blob/main/LICENSE) -- --Crawl4AI simplifies web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -- --#### [v0.2.77] - 2024-08-02 -- --Major improvements in functionality, performance, and cross-platform compatibility! 🚀 -- --- 🐳 **Docker enhancements**: -- - Significantly improved Dockerfile for easy installation on Linux, Mac, and Windows. --- 🌐 **Official Docker Hub image**: -- - Launched our first official image on Docker Hub for streamlined deployment (unclecode/crawl4ai). --- 🔧 **Selenium upgrade**: -- - Removed dependency on ChromeDriver, now using Selenium's built-in capabilities for better compatibility. --- 🖼️ **Image description**: -- - Implemented ability to generate textual descriptions for extracted images from web pages. --- ⚡ **Performance boost**: -- - Various improvements to enhance overall speed and performance. -- --## Try it Now! 
-- --✨ Play around with this [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sJPAmeLj5PMrg2VgOwMJ2ubGIcK0cJeX?usp=sharing) -- --✨ visit our [Documentation Website](https://crawl4ai.com/mkdocs/) -- --✨ Check [Demo](https://crawl4ai.com/mkdocs/demo) -- --## Features ✨ -- --- 🆓 Completely free and open-source --- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) --- 🌍 Supports crawling multiple URLs simultaneously --- 🎨 Extracts and returns all media tags (Images, Audio, and Video) --- 🔗 Extracts all external and internal links --- 📚 Extracts metadata from the page --- 🔄 Custom hooks for authentication, headers, and page modifications before crawling --- 🕵️ User-agent customization --- 🖼️ Takes screenshots of the page --- 📜 Executes multiple custom JavaScripts before crawling --- 📚 Various chunking strategies: topic-based, regex, sentence, and more --- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more --- 🎯 CSS selector support --- 📝 Passes instructions/keywords to refine extraction -- --# Crawl4AI -- --## 🌟 Shoutout to Contributors of v0.2.77! -- --A big thank you to the amazing contributors who've made this release possible: -- --- [@aravindkarnam](https://github.com/aravindkarnam) for the new image description feature --- [@FractalMind](https://github.com/FractalMind) for our official Docker Hub image --- [@ketonkss4](https://github.com/ketonkss4) for helping streamline our Selenium setup -- --Your contributions are driving Crawl4AI forward! 
🚀 -- --## Cool Examples 🚀 -- --### Quick Start -- --```python --from crawl4ai import WebCrawler -- --# Create an instance of WebCrawler --crawler = WebCrawler() -- --# Warm up the crawler (load necessary models) --crawler.warmup() -- --# Run the crawler on a URL --result = crawler.run(url="https://www.nbcnews.com/business") -- --# Print the extracted content --print(result.markdown) --``` -- --## How to install 🛠 -- --### Using pip 🐍 --```bash --virtualenv venv --source venv/bin/activate --pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git" --``` -- --### Using Docker 🐳 -- --```bash --# For Mac users (M1/M2) --# docker build --platform linux/amd64 -t crawl4ai . --docker build -t crawl4ai . --docker run -d -p 8000:80 crawl4ai --``` -- --### Using Docker Hub 🐳 -- --```bash --docker pull unclecode/crawl4ai:latest --docker run -d -p 8000:80 unclecode/crawl4ai:latest --``` -- -- --## Speed-First Design 🚀 -- --Perhaps the most important design principle for this library is speed. We need to ensure it can handle many links and resources in parallel as quickly as possible. By combining this speed with fast LLMs like Groq, the results will be truly amazing. -- --```python --import time --from crawl4ai.web_crawler import WebCrawler --crawler = WebCrawler() --crawler.warmup() -- --start = time.time() --url = r"https://www.nbcnews.com/business" --result = crawler.run( url, word_count_threshold=10, bypass_cache=True) --end = time.time() --print(f"Time taken: {end - start}") --``` -- --Let's take a look the calculated time for the above code snippet: -- --```bash --[LOG] 🚀 Crawling done, success: True, time taken: 1.3623387813568115 seconds --[LOG] 🚀 Content extracted, success: True, time taken: 0.05715131759643555 seconds --[LOG] 🚀 Extraction, time taken: 0.05750393867492676 seconds. --Time taken: 1.439958095550537 --``` --Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. 
🚀 -- --### Extract Structured Data from Web Pages 📊 -- --Crawl all OpenAI models and their fees from the official page. -- --```python --import os --from crawl4ai import WebCrawler --from crawl4ai.extraction_strategy import LLMExtractionStrategy --from pydantic import BaseModel, Field -- --class OpenAIModelFee(BaseModel): -- model_name: str = Field(..., description="Name of the OpenAI model.") -- input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") -- output_fee: str = Field(..., description="Fee for output token ßfor the OpenAI model.") -- --url = 'https://openai.com/api/pricing/' --crawler = WebCrawler() --crawler.warmup() -- --result = crawler.run( -- url=url, -- word_count_threshold=1, -- extraction_strategy= LLMExtractionStrategy( -- provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), -- schema=OpenAIModelFee.schema(), -- extraction_type="schema", -- instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. -- Do not miss any models in the entire content. 
One extracted model JSON format should look like this: -- {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""" -- ), -- bypass_cache=True, -- ) -- --print(result.extracted_content) --``` -- --### Execute JS, Filter Data with CSS Selector, and Clustering -- --```python --from crawl4ai import WebCrawler --from crawl4ai.chunking_strategy import CosineStrategy -- --js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"] -- --crawler = WebCrawler() --crawler.warmup() -- --result = crawler.run( -- url="https://www.nbcnews.com/business", -- js=js_code, -- css_selector="p", -- extraction_strategy=CosineStrategy(semantic_filter="technology") --) -- --print(result.extracted_content) --``` -- --### Extract Structured Data from Web Pages With Proxy and BaseUrl -- --```python --from crawl4ai import WebCrawler --from crawl4ai.extraction_strategy import LLMExtractionStrategy -- --def create_crawler(): -- crawler = WebCrawler(verbose=True, proxy="http://127.0.0.1:7890") -- crawler.warmup() -- return crawler -- --crawler = create_crawler() -- --crawler.warmup() -- --result = crawler.run( -- url="https://www.nbcnews.com/business", -- extraction_strategy=LLMExtractionStrategy( -- provider="openai/gpt-4o", -- api_token="sk-", -- base_url="https://api.openai.com/v1" -- ) --) -- --print(result.markdown) --``` -- --## Documentation 📚 -- --For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). -- --## Contributing 🤝 -- --We welcome contributions from the open-source community. Check out our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md) for more information. 
-- --## License 📄 -- --Crawl4AI is released under the [Apache 2.0 License](https://github.com/unclecode/crawl4ai/blob/main/LICENSE). -- --## Contact 📧 -- --For questions, suggestions, or feedback, feel free to reach out: -- --- GitHub: [unclecode](https://github.com/unclecode) --- Twitter: [@unclecode](https://twitter.com/unclecode) --- Website: [crawl4ai.com](https://crawl4ai.com) -- --Happy Crawling! 🕸️🚀 -- --## Star History -- --[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) -\ No newline at end of file -diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py -index cee7c25..d297dfc 100644 ---- a/crawl4ai/__init__.py -+++ b/crawl4ai/__init__.py -@@ -1,7 +1,11 @@ - # __init__.py - - from .async_webcrawler import AsyncWebCrawler, CacheMode -- -+from .async_configs import BrowserConfig, CrawlerRunConfig -+from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy -+from .chunking_strategy import ChunkingStrategy, RegexChunking -+from .markdown_generation_strategy import DefaultMarkdownGenerator -+from .content_filter_strategy import PruningContentFilter, BM25ContentFilter - from .models import CrawlResult - from .__version__ import __version__ - -@@ -9,6 +13,17 @@ __all__ = [ - "AsyncWebCrawler", - "CrawlResult", - "CacheMode", -+ 'BrowserConfig', -+ 'CrawlerRunConfig', -+ 'ExtractionStrategy', -+ 'LLMExtractionStrategy', -+ 'CosineStrategy', -+ 'JsonCssExtractionStrategy', -+ 'ChunkingStrategy', -+ 'RegexChunking', -+ 'DefaultMarkdownGenerator', -+ 'PruningContentFilter', -+ 'BM25ContentFilter', - ] - - def is_sync_version_installed(): -diff --git a/crawl4ai/async_crawler_strategy.current.py b/crawl4ai/async_crawler_strategy.current.py -deleted file mode 100644 -index 6302447..0000000 ---- a/crawl4ai/async_crawler_strategy.current.py -+++ /dev/null -@@ -1,1475 +0,0 @@ --import asyncio --import base64 --import 
time --from abc import ABC, abstractmethod --from typing import Callable, Dict, Any, List, Optional, Awaitable --import os, sys, shutil --import tempfile, subprocess --from playwright.async_api import async_playwright, Page, Browser, Error --from playwright.async_api import TimeoutError as PlaywrightTimeoutError --from io import BytesIO --from PIL import Image, ImageDraw, ImageFont --from pathlib import Path --from playwright.async_api import ProxySettings --from pydantic import BaseModel --import hashlib --import json --import uuid --from .models import AsyncCrawlResponse --from .utils import create_box_message --from .user_agent_generator import UserAgentGenerator --from playwright_stealth import StealthConfig, stealth_async -- --stealth_config = StealthConfig( -- webdriver=True, -- chrome_app=True, -- chrome_csi=True, -- chrome_load_times=True, -- chrome_runtime=True, -- navigator_languages=True, -- navigator_plugins=True, -- navigator_permissions=True, -- webgl_vendor=True, -- outerdimensions=True, -- navigator_hardware_concurrency=True, -- media_codecs=True, --) -- --BROWSER_DISABLE_OPTIONS = [ -- "--disable-background-networking", -- "--disable-background-timer-throttling", -- "--disable-backgrounding-occluded-windows", -- "--disable-breakpad", -- "--disable-client-side-phishing-detection", -- "--disable-component-extensions-with-background-pages", -- "--disable-default-apps", -- "--disable-extensions", -- "--disable-features=TranslateUI", -- "--disable-hang-monitor", -- "--disable-ipc-flooding-protection", -- "--disable-popup-blocking", -- "--disable-prompt-on-repost", -- "--disable-sync", -- "--force-color-profile=srgb", -- "--metrics-recording-only", -- "--no-first-run", -- "--password-store=basic", -- "--use-mock-keychain" --] -- -- --class ManagedBrowser: -- def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): -- 
self.browser_type = browser_type -- self.user_data_dir = user_data_dir -- self.headless = headless -- self.browser_process = None -- self.temp_dir = None -- self.debugging_port = debugging_port -- self.host = host -- self.logger = logger -- self.shutting_down = False -- -- async def start(self) -> str: -- """ -- Starts the browser process and returns the CDP endpoint URL. -- If user_data_dir is not provided, creates a temporary directory. -- """ -- -- # Create temp dir if needed -- if not self.user_data_dir: -- self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") -- self.user_data_dir = self.temp_dir -- -- # Get browser path and args based on OS and browser type -- browser_path = self._get_browser_path() -- args = self._get_browser_args() -- -- # Start browser process -- try: -- self.browser_process = subprocess.Popen( -- args, -- stdout=subprocess.PIPE, -- stderr=subprocess.PIPE -- ) -- # Monitor browser process output for errors -- asyncio.create_task(self._monitor_browser_process()) -- await asyncio.sleep(2) # Give browser time to start -- return f"http://{self.host}:{self.debugging_port}" -- except Exception as e: -- await self.cleanup() -- raise Exception(f"Failed to start browser: {e}") -- -- async def _monitor_browser_process(self): -- """Monitor the browser process for unexpected termination.""" -- if self.browser_process: -- try: -- stdout, stderr = await asyncio.gather( -- asyncio.to_thread(self.browser_process.stdout.read), -- asyncio.to_thread(self.browser_process.stderr.read) -- ) -- -- # Check shutting_down flag BEFORE logging anything -- if self.browser_process.poll() is not None: -- if not self.shutting_down: -- self.logger.error( -- message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", -- tag="ERROR", -- params={ -- "code": self.browser_process.returncode, -- "stdout": stdout.decode(), -- "stderr": stderr.decode() -- } -- ) -- await self.cleanup() -- else: -- self.logger.info( -- 
message="Browser process terminated normally | Code: {code}", -- tag="INFO", -- params={"code": self.browser_process.returncode} -- ) -- except Exception as e: -- if not self.shutting_down: -- self.logger.error( -- message="Error monitoring browser process: {error}", -- tag="ERROR", -- params={"error": str(e)} -- ) -- -- def _get_browser_path(self) -> str: -- """Returns the browser executable path based on OS and browser type""" -- if sys.platform == "darwin": # macOS -- paths = { -- "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", -- "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", -- "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" -- } -- elif sys.platform == "win32": # Windows -- paths = { -- "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", -- "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", -- "webkit": None # WebKit not supported on Windows -- } -- else: # Linux -- paths = { -- "chromium": "google-chrome", -- "firefox": "firefox", -- "webkit": None # WebKit not supported on Linux -- } -- -- return paths.get(self.browser_type) -- -- def _get_browser_args(self) -> List[str]: -- """Returns browser-specific command line arguments""" -- base_args = [self._get_browser_path()] -- -- if self.browser_type == "chromium": -- args = [ -- f"--remote-debugging-port={self.debugging_port}", -- f"--user-data-dir={self.user_data_dir}", -- ] -- if self.headless: -- args.append("--headless=new") -- elif self.browser_type == "firefox": -- args = [ -- "--remote-debugging-port", str(self.debugging_port), -- "--profile", self.user_data_dir, -- ] -- if self.headless: -- args.append("--headless") -- else: -- raise NotImplementedError(f"Browser type {self.browser_type} not supported") -- -- return base_args + args -- -- async def cleanup(self): -- """Cleanup browser process and temporary directory""" -- # Set shutting_down flag BEFORE any termination actions -- self.shutting_down = True -- -- if 
self.browser_process: -- try: -- self.browser_process.terminate() -- # Wait for process to end gracefully -- for _ in range(10): # 10 attempts, 100ms each -- if self.browser_process.poll() is not None: -- break -- await asyncio.sleep(0.1) -- -- # Force kill if still running -- if self.browser_process.poll() is None: -- self.browser_process.kill() -- await asyncio.sleep(0.1) # Brief wait for kill to take effect -- -- except Exception as e: -- self.logger.error( -- message="Error terminating browser: {error}", -- tag="ERROR", -- params={"error": str(e)} -- ) -- -- if self.temp_dir and os.path.exists(self.temp_dir): -- try: -- shutil.rmtree(self.temp_dir) -- except Exception as e: -- self.logger.error( -- message="Error removing temporary directory: {error}", -- tag="ERROR", -- params={"error": str(e)} -- ) -- -- --class AsyncCrawlerStrategy(ABC): -- @abstractmethod -- async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: -- pass -- -- @abstractmethod -- async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: -- pass -- -- @abstractmethod -- async def take_screenshot(self, **kwargs) -> str: -- pass -- -- @abstractmethod -- def update_user_agent(self, user_agent: str): -- pass -- -- @abstractmethod -- def set_hook(self, hook_type: str, hook: Callable): -- pass -- --class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): -- def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): -- self.text_only = kwargs.get("text_only", False) -- self.light_mode = kwargs.get("light_mode", False) -- self.logger = logger -- self.use_cached_html = use_cached_html -- self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920) -- self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080) -- -- if self.text_only: -- self.extra_args = kwargs.get("extra_args", []) + [ -- '--disable-images', -- '--disable-javascript', -- '--disable-gpu', -- '--disable-software-rasterizer', -- 
'--disable-dev-shm-usage' -- ] -- -- self.user_agent = kwargs.get( -- "user_agent", -- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" -- # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" -- ) -- user_agenr_generator = UserAgentGenerator() -- if kwargs.get("user_agent_mode") == "random": -- self.user_agent = user_agenr_generator.generate( -- **kwargs.get("user_agent_generator_config", {}) -- ) -- self.proxy = kwargs.get("proxy") -- self.proxy_config = kwargs.get("proxy_config") -- self.headless = kwargs.get("headless", True) -- self.browser_type = kwargs.get("browser_type", "chromium") -- self.headers = kwargs.get("headers", {}) -- self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) -- self.headers.setdefault("sec-ch-ua", self.browser_hint) -- self.cookies = kwargs.get("cookies", []) -- self.storage_state = kwargs.get("storage_state", None) -- self.sessions = {} -- self.session_ttl = 1800 -- self.js_code = js_code -- self.verbose = kwargs.get("verbose", False) -- self.playwright = None -- self.browser = None -- self.sleep_on_close = kwargs.get("sleep_on_close", False) -- self.use_managed_browser = kwargs.get("use_managed_browser", False) -- self.user_data_dir = kwargs.get("user_data_dir", None) -- self.use_persistent_context = kwargs.get("use_persistent_context", False) -- self.chrome_channel = kwargs.get("chrome_channel", "chrome") -- self.managed_browser = None -- self.default_context = None -- self.hooks = { -- 'on_browser_created': None, -- 'on_user_agent_updated': None, -- 'on_execution_started': None, -- 'before_goto': None, -- 'after_goto': None, -- 'before_return_html': None, -- 'before_retrieve_html': None -- } -- self.extra_args = kwargs.get("extra_args", []) -- self.ignore_https_errors = kwargs.get("ignore_https_errors", True) -- 
self.java_script_enabled = kwargs.get("java_script_enabled", True) -- self.accept_downloads = kwargs.get("accept_downloads", False) -- self.downloads_path = kwargs.get("downloads_path") -- self._downloaded_files = [] # Track downloaded files for current crawl -- if self.accept_downloads and not self.downloads_path: -- self.downloads_path = os.path.join(os.getcwd(), "downloads") -- os.makedirs(self.downloads_path, exist_ok=True) -- -- -- async def __aenter__(self): -- await self.start() -- return self -- -- async def __aexit__(self, exc_type, exc_val, exc_tb): -- await self.close() -- -- async def start(self): -- if self.playwright is None: -- self.playwright = await async_playwright().start() -- if self.browser is None: -- if self.use_managed_browser: -- # Use managed browser approach -- self.managed_browser = ManagedBrowser( -- browser_type=self.browser_type, -- user_data_dir=self.user_data_dir, -- headless=self.headless, -- logger=self.logger -- ) -- cdp_url = await self.managed_browser.start() -- self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) -- -- # Get the default context that maintains the user profile -- contexts = self.browser.contexts -- if contexts: -- self.default_context = contexts[0] -- else: -- # If no default context exists, create one -- self.default_context = await self.browser.new_context( -- viewport={"width": self.viewport_width, "height": self.viewport_height}, -- storage_state=self.storage_state, -- user_agent= self.user_agent, -- accept_downloads=self.accept_downloads, -- ignore_https_errors=self.ignore_https_errors, -- java_script_enabled=self.java_script_enabled, -- ) -- -- # Set up the default context -- if self.default_context: -- await self.default_context.set_extra_http_headers(self.headers) -- if self.cookies: -- await self.default_context.add_cookies(self.cookies) -- if self.storage_state: -- # If storage_state is a dictionary or file path, Playwright will handle it. 
-- await self.default_context.storage_state(path=None) # Just ensuring default_context is ready -- if self.accept_downloads: -- await self.default_context.set_default_timeout(60000) -- await self.default_context.set_default_navigation_timeout(60000) -- self.default_context._impl_obj._options["accept_downloads"] = True -- self.default_context._impl_obj._options["downloads_path"] = self.downloads_path -- -- if self.user_agent: -- await self.default_context.set_extra_http_headers({ -- "User-Agent": self.user_agent, -- "sec-ch-ua": self.browser_hint, -- # **self.headers -- }) -- else: -- # Base browser arguments -- browser_args = { -- "headless": self.headless, -- "args": [ -- "--no-sandbox", -- "--disable-dev-shm-usage", -- "--no-first-run", -- "--no-default-browser-check", -- "--disable-infobars", -- "--window-position=0,0", -- "--ignore-certificate-errors", -- "--ignore-certificate-errors-spki-list", -- "--disable-blink-features=AutomationControlled", -- "--window-position=400,0", -- f"--window-size={self.viewport_width},{self.viewport_height}", -- ] -- } -- -- if self.light_mode: -- browser_args["args"].extend(BROWSER_DISABLE_OPTIONS) -- -- if self.text_only: -- browser_args["args"].extend([ -- '--blink-settings=imagesEnabled=false', -- '--disable-remote-fonts' -- ]) -- -- # Add channel if specified (try Chrome first) -- if self.chrome_channel: -- browser_args["channel"] = self.chrome_channel -- -- # Add extra args if provided -- if self.extra_args: -- browser_args["args"].extend(self.extra_args) -- -- # Add downloads path if downloads are enabled -- if self.accept_downloads: -- browser_args["downloads_path"] = self.downloads_path -- -- # Add proxy settings if a proxy is specified -- if self.proxy: -- proxy_settings = ProxySettings(server=self.proxy) -- browser_args["proxy"] = proxy_settings -- elif self.proxy_config: -- proxy_settings = ProxySettings( -- server=self.proxy_config.get("server"), -- username=self.proxy_config.get("username"), -- 
password=self.proxy_config.get("password") -- ) -- browser_args["proxy"] = proxy_settings -- -- try: -- # Select the appropriate browser based on the browser_type -- if self.browser_type == "firefox": -- self.browser = await self.playwright.firefox.launch(**browser_args) -- elif self.browser_type == "webkit": -- if "viewport" not in browser_args: -- browser_args["viewport"] = {"width": self.viewport_width, "height": self.viewport_height} -- self.browser = await self.playwright.webkit.launch(**browser_args) -- else: -- if self.use_persistent_context and self.user_data_dir: -- self.browser = await self.playwright.chromium.launch_persistent_context( -- user_data_dir=self.user_data_dir, -- accept_downloads=self.accept_downloads, -- downloads_path=self.downloads_path if self.accept_downloads else None, -- **browser_args -- ) -- self.default_context = self.browser -- else: -- self.browser = await self.playwright.chromium.launch(**browser_args) -- self.default_context = self.browser -- -- except Exception as e: -- # Fallback to chromium if Chrome channel fails -- if "chrome" in str(e) and browser_args.get("channel") == "chrome": -- browser_args["channel"] = "chromium" -- if self.use_persistent_context and self.user_data_dir: -- self.browser = await self.playwright.chromium.launch_persistent_context( -- user_data_dir=self.user_data_dir, -- **browser_args -- ) -- self.default_context = self.browser -- else: -- self.browser = await self.playwright.chromium.launch(**browser_args) -- else: -- raise -- -- await self.execute_hook('on_browser_created', self.browser) -- -- async def close(self): -- if self.sleep_on_close: -- await asyncio.sleep(0.5) -- -- # Close all active sessions -- session_ids = list(self.sessions.keys()) -- for session_id in session_ids: -- await self.kill_session(session_id) -- -- if self.browser: -- await self.browser.close() -- self.browser = None -- -- if self.managed_browser: -- await asyncio.sleep(0.5) -- await self.managed_browser.cleanup() -- 
self.managed_browser = None -- -- if self.playwright: -- await self.playwright.stop() -- self.playwright = None -- -- # Issue #256: Remove __del__ method to avoid potential issues with async cleanup -- # def __del__(self): -- # if self.browser or self.playwright: -- # asyncio.get_event_loop().run_until_complete(self.close()) -- -- def set_hook(self, hook_type: str, hook: Callable): -- if hook_type in self.hooks: -- self.hooks[hook_type] = hook -- else: -- raise ValueError(f"Invalid hook type: {hook_type}") -- -- async def execute_hook(self, hook_type: str, *args, **kwargs): -- hook = self.hooks.get(hook_type) -- if hook: -- if asyncio.iscoroutinefunction(hook): -- return await hook(*args, **kwargs) -- else: -- return hook(*args, **kwargs) -- return args[0] if args else None -- -- def update_user_agent(self, user_agent: str): -- self.user_agent = user_agent -- -- def set_custom_headers(self, headers: Dict[str, str]): -- self.headers = headers -- -- async def kill_session(self, session_id: str): -- if session_id in self.sessions: -- context, page, _ = self.sessions[session_id] -- await page.close() -- if not self.use_managed_browser: -- await context.close() -- del self.sessions[session_id] -- -- def _cleanup_expired_sessions(self): -- current_time = time.time() -- expired_sessions = [ -- sid for sid, (_, _, last_used) in self.sessions.items() -- if current_time - last_used > self.session_ttl -- ] -- for sid in expired_sessions: -- asyncio.create_task(self.kill_session(sid)) -- -- async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): -- wait_for = wait_for.strip() -- -- if wait_for.startswith('js:'): -- # Explicitly specified JavaScript -- js_code = wait_for[3:].strip() -- return await self.csp_compliant_wait(page, js_code, timeout) -- elif wait_for.startswith('css:'): -- # Explicitly specified CSS selector -- css_selector = wait_for[4:].strip() -- try: -- await page.wait_for_selector(css_selector, timeout=timeout) -- except Error as e: -- if 
'Timeout' in str(e): -- raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") -- else: -- raise ValueError(f"Invalid CSS selector: '{css_selector}'") -- else: -- # Auto-detect based on content -- if wait_for.startswith('()') or wait_for.startswith('function'): -- # It's likely a JavaScript function -- return await self.csp_compliant_wait(page, wait_for, timeout) -- else: -- # Assume it's a CSS selector first -- try: -- await page.wait_for_selector(wait_for, timeout=timeout) -- except Error as e: -- if 'Timeout' in str(e): -- raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") -- else: -- # If it's not a timeout error, it might be an invalid selector -- # Let's try to evaluate it as a JavaScript function as a fallback -- try: -- return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) -- except Error: -- raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. " -- "It should be either a valid CSS selector, a JavaScript function, " -- "or explicitly prefixed with 'js:' or 'css:'.") -- -- async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): -- wrapper_js = f""" -- async () => {{ -- const userFunction = {user_wait_function}; -- const startTime = Date.now(); -- while (true) {{ -- if (await userFunction()) {{ -- return true; -- }} -- if (Date.now() - startTime > {timeout}) {{ -- throw new Error('Timeout waiting for condition'); -- }} -- await new Promise(resolve => setTimeout(resolve, 100)); -- }} -- }} -- """ -- -- try: -- await page.evaluate(wrapper_js) -- except TimeoutError: -- raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") -- except Exception as e: -- raise RuntimeError(f"Error in wait condition: {str(e)}") -- -- async def process_iframes(self, page): -- # Find all iframes -- iframes = await page.query_selector_all('iframe') -- -- for i, iframe in enumerate(iframes): -- try: -- # Add a unique identifier to the 
iframe -- await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') -- -- # Get the frame associated with this iframe -- frame = await iframe.content_frame() -- -- if frame: -- # Wait for the frame to load -- await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout -- -- # Extract the content of the iframe's body -- iframe_content = await frame.evaluate('() => document.body.innerHTML') -- -- # Generate a unique class name for this iframe -- class_name = f'extracted-iframe-content-{i}' -- -- # Replace the iframe with a div containing the extracted content -- _iframe = iframe_content.replace('`', '\\`') -- await page.evaluate(f""" -- () => {{ -- const iframe = document.getElementById('iframe-{i}'); -- const div = document.createElement('div'); -- div.innerHTML = `{_iframe}`; -- div.className = '{class_name}'; -- iframe.replaceWith(div); -- }} -- """) -- else: -- # print(f"Warning: Could not access content frame for iframe {i}") -- self.logger.warning( -- message="Could not access content frame for iframe {index}", -- tag="SCRAPE", -- params={"index": i} -- ) -- except Exception as e: -- self.logger.error( -- message="Error processing iframe {index}: {error}", -- tag="ERROR", -- params={"index": i, "error": str(e)} -- ) -- # print(f"Error processing iframe {i}: {str(e)}") -- -- # Return the page object -- return page -- -- async def create_session(self, **kwargs) -> str: -- """Creates a new browser session and returns its ID.""" -- if not self.browser: -- await self.start() -- -- session_id = kwargs.get('session_id') or str(uuid.uuid4()) -- -- if self.use_managed_browser: -- page = await self.default_context.new_page() -- self.sessions[session_id] = (self.default_context, page, time.time()) -- else: -- if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: -- context = self.browser -- page = await context.new_page() -- else: -- context = await self.browser.new_context( -- user_agent=kwargs.get("user_agent", 
self.user_agent), -- viewport={"width": self.viewport_width, "height": self.viewport_height}, -- proxy={"server": self.proxy} if self.proxy else None, -- accept_downloads=self.accept_downloads, -- storage_state=self.storage_state, -- ignore_https_errors=True -- ) -- -- if self.cookies: -- await context.add_cookies(self.cookies) -- await context.set_extra_http_headers(self.headers) -- page = await context.new_page() -- -- self.sessions[session_id] = (context, page, time.time()) -- -- return session_id -- -- async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: -- """ -- Crawls a given URL or processes raw HTML/local file content based on the URL prefix. -- -- Args: -- url (str): The URL to crawl. Supported prefixes: -- - 'http://' or 'https://': Web URL to crawl. -- - 'file://': Local file path to process. -- - 'raw:': Raw HTML content to process. -- **kwargs: Additional parameters: -- - 'screenshot' (bool): Whether to take a screenshot. -- - ... [other existing parameters] -- -- Returns: -- AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
-- """ -- response_headers = {} -- status_code = 200 # Default to 200 for local/raw HTML -- screenshot_requested = kwargs.get('screenshot', False) -- screenshot_data = None -- -- if url.startswith(('http://', 'https://')): -- # Proceed with standard web crawling -- return await self._crawl_web(url, **kwargs) -- -- elif url.startswith('file://'): -- # Process local file -- local_file_path = url[7:] # Remove 'file://' prefix -- if not os.path.exists(local_file_path): -- raise FileNotFoundError(f"Local file not found: {local_file_path}") -- with open(local_file_path, 'r', encoding='utf-8') as f: -- html = f.read() -- if screenshot_requested: -- screenshot_data = await self._generate_screenshot_from_html(html) -- return AsyncCrawlResponse( -- html=html, -- response_headers=response_headers, -- status_code=status_code, -- screenshot=screenshot_data, -- get_delayed_content=None -- ) -- -- elif url.startswith('raw:'): -- # Process raw HTML content -- raw_html = url[4:] # Remove 'raw:' prefix -- html = raw_html -- if screenshot_requested: -- screenshot_data = await self._generate_screenshot_from_html(html) -- return AsyncCrawlResponse( -- html=html, -- response_headers=response_headers, -- status_code=status_code, -- screenshot=screenshot_data, -- get_delayed_content=None -- ) -- else: -- raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") -- -- -- async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: -- """ -- Existing web crawling logic remains unchanged. -- -- Args: -- url (str): The web URL to crawl. -- **kwargs: Additional parameters. -- -- Returns: -- AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
-- """ -- response_headers = {} -- status_code = None -- -- # Reset downloaded files list for new crawl -- self._downloaded_files = [] -- -- self._cleanup_expired_sessions() -- session_id = kwargs.get("session_id") -- -- # Check if in kwargs we have user_agent that will override the default user_agent -- user_agent = kwargs.get("user_agent", self.user_agent) -- -- # Generate random user agent if magic mode is enabled and user_agent_mode is not random -- if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False): -- user_agent = UserAgentGenerator().generate( -- **kwargs.get("user_agent_generator_config", {}) -- ) -- -- # Handle page creation differently for managed browser -- context = None -- if self.use_managed_browser: -- if session_id: -- # Reuse existing session if available -- context, page, _ = self.sessions.get(session_id, (None, None, None)) -- if not page: -- # Create new page in default context if session doesn't exist -- page = await self.default_context.new_page() -- self.sessions[session_id] = (self.default_context, page, time.time()) -- else: -- # Create new page in default context for non-session requests -- page = await self.default_context.new_page() -- else: -- if session_id: -- context, page, _ = self.sessions.get(session_id, (None, None, None)) -- if not context: -- if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: -- # In persistent context, browser is the context -- context = self.browser -- else: -- # Normal context creation for non-persistent or non-Chrome browsers -- context = await self.browser.new_context( -- user_agent=user_agent, -- viewport={"width": self.viewport_width, "height": self.viewport_height}, -- proxy={"server": self.proxy} if self.proxy else None, -- java_script_enabled=True, -- accept_downloads=self.accept_downloads, -- storage_state=self.storage_state, -- # downloads_path=self.downloads_path if self.accept_downloads else None -- ) -- await context.add_cookies([{"name": 
"cookiesEnabled", "value": "true", "url": url}]) -- if self.cookies: -- await context.add_cookies(self.cookies) -- await context.set_extra_http_headers(self.headers) -- -- page = await context.new_page() -- self.sessions[session_id] = (context, page, time.time()) -- else: -- if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: -- # In persistent context, browser is the context -- context = self.browser -- else: -- # Normal context creation -- context = await self.browser.new_context( -- user_agent=user_agent, -- # viewport={"width": 1920, "height": 1080}, -- viewport={"width": self.viewport_width, "height": self.viewport_height}, -- proxy={"server": self.proxy} if self.proxy else None, -- accept_downloads=self.accept_downloads, -- storage_state=self.storage_state, -- ignore_https_errors=True # Add this line -- ) -- if self.cookies: -- await context.add_cookies(self.cookies) -- await context.set_extra_http_headers(self.headers) -- -- if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): -- # Inject scripts to override navigator properties -- await context.add_init_script(""" -- // Pass the Permissions Test. -- const originalQuery = window.navigator.permissions.query; -- window.navigator.permissions.query = (parameters) => ( -- parameters.name === 'notifications' ? 
-- Promise.resolve({ state: Notification.permission }) : -- originalQuery(parameters) -- ); -- Object.defineProperty(navigator, 'webdriver', { -- get: () => undefined -- }); -- window.navigator.chrome = { -- runtime: {}, -- // Add other properties if necessary -- }; -- Object.defineProperty(navigator, 'plugins', { -- get: () => [1, 2, 3, 4, 5], -- }); -- Object.defineProperty(navigator, 'languages', { -- get: () => ['en-US', 'en'], -- }); -- Object.defineProperty(document, 'hidden', { -- get: () => false -- }); -- Object.defineProperty(document, 'visibilityState', { -- get: () => 'visible' -- }); -- """) -- -- page = await context.new_page() -- if kwargs.get("magic", False): -- await stealth_async(page, stealth_config) -- -- # Add console message and error logging -- if kwargs.get("log_console", False): -- page.on("console", lambda msg: print(f"Console: {msg.text}")) -- page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) -- -- try: -- # Set up download handling if enabled -- if self.accept_downloads: -- page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) -- -- if self.use_cached_html: -- cache_file_path = os.path.join( -- os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() -- ) -- if os.path.exists(cache_file_path): -- html = "" -- with open(cache_file_path, "r") as f: -- html = f.read() -- # retrieve response headers and status code from cache -- with open(cache_file_path + ".meta", "r") as f: -- meta = json.load(f) -- response_headers = meta.get("response_headers", {}) -- status_code = meta.get("status_code") -- response = AsyncCrawlResponse( -- html=html, response_headers=response_headers, status_code=status_code -- ) -- return response -- -- if not kwargs.get("js_only", False): -- await self.execute_hook('before_goto', page, context = context, **kwargs) -- -- try: -- response = await page.goto( -- url, -- # wait_until=kwargs.get("wait_until", 
["domcontentloaded", "networkidle"]), -- wait_until=kwargs.get("wait_until", "domcontentloaded"), -- timeout=kwargs.get("page_timeout", 60000), -- ) -- except Error as e: -- raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") -- -- await self.execute_hook('after_goto', page, context = context, **kwargs) -- -- # Get status code and headers -- status_code = response.status -- response_headers = response.headers -- else: -- status_code = 200 -- response_headers = {} -- -- # Replace the current wait_for_selector line with this more robust check: -- try: -- # First wait for body to exist, regardless of visibility -- await page.wait_for_selector('body', state='attached', timeout=30000) -- -- # Then wait for it to become visible by checking CSS -- await page.wait_for_function(""" -- () => { -- const body = document.body; -- const style = window.getComputedStyle(body); -- return style.display !== 'none' && -- style.visibility !== 'hidden' && -- style.opacity !== '0'; -- } -- """, timeout=30000) -- -- except Error as e: -- # If waiting fails, let's try to diagnose the issue -- visibility_info = await page.evaluate(""" -- () => { -- const body = document.body; -- const style = window.getComputedStyle(body); -- return { -- display: style.display, -- visibility: style.visibility, -- opacity: style.opacity, -- hasContent: body.innerHTML.length, -- classList: Array.from(body.classList) -- } -- } -- """) -- -- if self.verbose: -- print(f"Body visibility debug info: {visibility_info}") -- -- # Even if body is hidden, we might still want to proceed -- if kwargs.get('ignore_body_visibility', True): -- if self.verbose: -- print("Proceeding despite hidden body...") -- pass -- else: -- raise Error(f"Body element is hidden: {visibility_info}") -- -- # CONTENT LOADING ASSURANCE -- if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): -- # Wait for network idle after initial load and images to load -- # await 
page.wait_for_load_state("networkidle") -- await page.wait_for_load_state("domcontentloaded") -- await asyncio.sleep(0.1) -- from playwright.async_api import TimeoutError as PlaywrightTimeoutError -- try: -- await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) -- # Check for TimeoutError and ignore it -- except PlaywrightTimeoutError: -- pass -- -- # After initial load, adjust viewport to content size -- if not self.text_only and kwargs.get("adjust_viewport_to_content", False): -- try: -- # Get actual page dimensions -- page_width = await page.evaluate("document.documentElement.scrollWidth") -- page_height = await page.evaluate("document.documentElement.scrollHeight") -- -- target_width = self.viewport_width -- target_height = int(target_width * page_width / page_height * 0.95) -- await page.set_viewport_size({"width": target_width, "height": target_height}) -- -- # Compute scale factor -- # We want the entire page visible: the scale should make both width and height fit -- scale = min(target_width / page_width, target_height / page_height) -- -- # Now we call CDP to set metrics. -- # We tell Chrome that the "device" is page_width x page_height in size, -- # but we scale it down so everything fits within the real viewport. 
-- cdp = await page.context.new_cdp_session(page) -- await cdp.send('Emulation.setDeviceMetricsOverride', { -- 'width': page_width, # full page width -- 'height': page_height, # full page height -- 'deviceScaleFactor': 1, # keep normal DPR -- 'mobile': False, -- 'scale': scale # scale the entire rendered content -- }) -- -- except Exception as e: -- self.logger.warning( -- message="Failed to adjust viewport to content: {error}", -- tag="VIEWPORT", -- params={"error": str(e)} -- ) -- -- # After viewport adjustment, handle page scanning if requested -- if kwargs.get("scan_full_page", False): -- try: -- viewport_height = page.viewport_size.get("height", self.viewport_height) -- current_position = viewport_height # Start with one viewport height -- scroll_delay = kwargs.get("scroll_delay", 0.2) -- -- # Initial scroll -- await page.evaluate(f"window.scrollTo(0, {current_position})") -- await asyncio.sleep(scroll_delay) -- -- # Get height after first scroll to account for any dynamic content -- total_height = await page.evaluate("document.documentElement.scrollHeight") -- -- while current_position < total_height: -- current_position = min(current_position + viewport_height, total_height) -- await page.evaluate(f"window.scrollTo(0, {current_position})") -- await asyncio.sleep(scroll_delay) -- -- # Check for dynamic content -- new_height = await page.evaluate("document.documentElement.scrollHeight") -- if new_height > total_height: -- total_height = new_height -- -- # Scroll back to top -- await page.evaluate("window.scrollTo(0, 0)") -- -- except Exception as e: -- self.logger.warning( -- message="Failed to perform full page scan: {error}", -- tag="PAGE_SCAN", -- params={"error": str(e)} -- ) -- else: -- # Scroll to the bottom of the page -- await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") -- -- js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) -- if js_code: -- if isinstance(js_code, str): -- await page.evaluate(js_code) -- elif 
isinstance(js_code, list): -- for js in js_code: -- await page.evaluate(js) -- -- # await page.wait_for_timeout(100) -- -- # Check for on execution event -- await self.execute_hook('on_execution_started', page, context = context, **kwargs) -- -- if kwargs.get("simulate_user", False) or kwargs.get("magic", False): -- # Simulate user interactions -- await page.mouse.move(100, 100) -- await page.mouse.down() -- await page.mouse.up() -- await page.keyboard.press('ArrowDown') -- -- # Handle the wait_for parameter -- wait_for = kwargs.get("wait_for") -- if wait_for: -- try: -- await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) -- except Exception as e: -- raise RuntimeError(f"Wait condition failed: {str(e)}") -- -- # if not wait_for and js_code: -- # await page.wait_for_load_state('networkidle', timeout=5000) -- -- # Update image dimensions -- if not self.text_only: -- update_image_dimensions_js = """ -- () => { -- return new Promise((resolve) => { -- const filterImage = (img) => { -- // Filter out images that are too small -- if (img.width < 100 && img.height < 100) return false; -- -- // Filter out images that are not visible -- const rect = img.getBoundingClientRect(); -- if (rect.width === 0 || rect.height === 0) return false; -- -- // Filter out images with certain class names (e.g., icons, thumbnails) -- if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; -- -- // Filter out images with certain patterns in their src (e.g., placeholder images) -- if (img.src.includes('placeholder') || img.src.includes('icon')) return false; -- -- return true; -- }; -- -- const images = Array.from(document.querySelectorAll('img')).filter(filterImage); -- let imagesLeft = images.length; -- -- if (imagesLeft === 0) { -- resolve(); -- return; -- } -- -- const checkImage = (img) => { -- if (img.complete && img.naturalWidth !== 0) { -- img.setAttribute('width', img.naturalWidth); -- img.setAttribute('height', 
img.naturalHeight); -- imagesLeft--; -- if (imagesLeft === 0) resolve(); -- } -- }; -- -- images.forEach(img => { -- checkImage(img); -- if (!img.complete) { -- img.onload = () => { -- checkImage(img); -- }; -- img.onerror = () => { -- imagesLeft--; -- if (imagesLeft === 0) resolve(); -- }; -- } -- }); -- -- // Fallback timeout of 5 seconds -- // setTimeout(() => resolve(), 5000); -- resolve(); -- }); -- } -- """ -- -- try: -- try: -- await page.wait_for_load_state( -- # state="load", -- state="domcontentloaded", -- timeout=5 -- ) -- except PlaywrightTimeoutError: -- pass -- await page.evaluate(update_image_dimensions_js) -- except Exception as e: -- self.logger.error( -- message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", -- tag="ERROR", -- params={"error": str(e)} -- ) -- # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") -- -- # Wait a bit for any onload events to complete -- # await page.wait_for_timeout(100) -- -- # Process iframes -- if kwargs.get("process_iframes", False): -- page = await self.process_iframes(page) -- -- await self.execute_hook('before_retrieve_html', page, context = context, **kwargs) -- # Check if delay_before_return_html is set then wait for that time -- delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) -- if delay_before_return_html: -- await asyncio.sleep(delay_before_return_html) -- -- # Check for remove_overlay_elements parameter -- if kwargs.get("remove_overlay_elements", False): -- await self.remove_overlay_elements(page) -- -- html = await page.content() -- await self.execute_hook('before_return_html', page, html, context = context, **kwargs) -- -- # Check if kwargs has screenshot=True then take screenshot -- screenshot_data = None -- if kwargs.get("screenshot"): -- # Check we have screenshot_wait_for parameter, if we have simply wait for that time -- screenshot_wait_for = kwargs.get("screenshot_wait_for") -- if screenshot_wait_for: 
-- await asyncio.sleep(screenshot_wait_for) -- screenshot_data = await self.take_screenshot(page) -- -- # if self.verbose: -- # print(f"[LOG] ✅ Crawled {url} successfully!") -- -- if self.use_cached_html: -- cache_file_path = os.path.join( -- os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() -- ) -- with open(cache_file_path, "w", encoding="utf-8") as f: -- f.write(html) -- # store response headers and status code in cache -- with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: -- json.dump({ -- "response_headers": response_headers, -- "status_code": status_code -- }, f) -- -- async def get_delayed_content(delay: float = 5.0) -> str: -- if self.verbose: -- print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") -- await asyncio.sleep(delay) -- return await page.content() -- -- response = AsyncCrawlResponse( -- html=html, -- response_headers=response_headers, -- status_code=status_code, -- screenshot=screenshot_data, -- get_delayed_content=get_delayed_content, -- downloaded_files=self._downloaded_files if self._downloaded_files else None -- ) -- return response -- except Error as e: -- raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") -- # finally: -- # if not session_id: -- # await page.close() -- # await context.close() -- -- async def _handle_download(self, download): -- """Handle file downloads.""" -- try: -- suggested_filename = download.suggested_filename -- download_path = os.path.join(self.downloads_path, suggested_filename) -- -- self.logger.info( -- message="Downloading {filename} to {path}", -- tag="FETCH", -- params={"filename": suggested_filename, "path": download_path} -- ) -- -- start_time = time.perf_counter() -- await download.save_as(download_path) -- end_time = time.perf_counter() -- self._downloaded_files.append(download_path) -- -- self.logger.success( -- message="Downloaded {filename} successfully", -- tag="COMPLETE", -- 
params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} -- ) -- except Exception as e: -- self.logger.error( -- message="Failed to handle download: {error}", -- tag="ERROR", -- params={"error": str(e)} -- ) -- -- # if self.verbose: -- # print(f"[ERROR] Failed to handle download: {str(e)}") -- -- async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: -- semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed -- semaphore = asyncio.Semaphore(semaphore_count) -- -- async def crawl_with_semaphore(url): -- async with semaphore: -- return await self.crawl(url, **kwargs) -- -- tasks = [crawl_with_semaphore(url) for url in urls] -- results = await asyncio.gather(*tasks, return_exceptions=True) -- return [result if not isinstance(result, Exception) else str(result) for result in results] -- -- async def remove_overlay_elements(self, page: Page) -> None: -- """ -- Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. 
-- -- Args: -- page (Page): The Playwright page instance -- """ -- remove_overlays_js = """ -- async () => { -- // Function to check if element is visible -- const isVisible = (elem) => { -- const style = window.getComputedStyle(elem); -- return style.display !== 'none' && -- style.visibility !== 'hidden' && -- style.opacity !== '0'; -- }; -- -- // Common selectors for popups and overlays -- const commonSelectors = [ -- // Close buttons first -- 'button[class*="close" i]', 'button[class*="dismiss" i]', -- 'button[aria-label*="close" i]', 'button[title*="close" i]', -- 'a[class*="close" i]', 'span[class*="close" i]', -- -- // Cookie notices -- '[class*="cookie-banner" i]', '[id*="cookie-banner" i]', -- '[class*="cookie-consent" i]', '[id*="cookie-consent" i]', -- -- // Newsletter/subscription dialogs -- '[class*="newsletter" i]', '[class*="subscribe" i]', -- -- // Generic popups/modals -- '[class*="popup" i]', '[class*="modal" i]', -- '[class*="overlay" i]', '[class*="dialog" i]', -- '[role="dialog"]', '[role="alertdialog"]' -- ]; -- -- // Try to click close buttons first -- for (const selector of commonSelectors.slice(0, 6)) { -- const closeButtons = document.querySelectorAll(selector); -- for (const button of closeButtons) { -- if (isVisible(button)) { -- try { -- button.click(); -- await new Promise(resolve => setTimeout(resolve, 100)); -- } catch (e) { -- console.log('Error clicking button:', e); -- } -- } -- } -- } -- -- // Remove remaining overlay elements -- const removeOverlays = () => { -- // Find elements with high z-index -- const allElements = document.querySelectorAll('*'); -- for (const elem of allElements) { -- const style = window.getComputedStyle(elem); -- const zIndex = parseInt(style.zIndex); -- const position = style.position; -- -- if ( -- isVisible(elem) && -- (zIndex > 999 || position === 'fixed' || position === 'absolute') && -- ( -- elem.offsetWidth > window.innerWidth * 0.5 || -- elem.offsetHeight > window.innerHeight * 0.5 || -- 
style.backgroundColor.includes('rgba') || -- parseFloat(style.opacity) < 1 -- ) -- ) { -- elem.remove(); -- } -- } -- -- // Remove elements matching common selectors -- for (const selector of commonSelectors) { -- const elements = document.querySelectorAll(selector); -- elements.forEach(elem => { -- if (isVisible(elem)) { -- elem.remove(); -- } -- }); -- } -- }; -- -- // Remove overlay elements -- removeOverlays(); -- -- // Remove any fixed/sticky position elements at the top/bottom -- const removeFixedElements = () => { -- const elements = document.querySelectorAll('*'); -- elements.forEach(elem => { -- const style = window.getComputedStyle(elem); -- if ( -- (style.position === 'fixed' || style.position === 'sticky') && -- isVisible(elem) -- ) { -- elem.remove(); -- } -- }); -- }; -- -- removeFixedElements(); -- -- // Remove empty block elements as: div, p, span, etc. -- const removeEmptyBlockElements = () => { -- const blockElements = document.querySelectorAll('div, p, span, section, article, header, footer, aside, nav, main, ul, ol, li, dl, dt, dd, h1, h2, h3, h4, h5, h6'); -- blockElements.forEach(elem => { -- if (elem.innerText.trim() === '') { -- elem.remove(); -- } -- }); -- }; -- -- // Remove margin-right and padding-right from body (often added by modal scripts) -- document.body.style.marginRight = '0px'; -- document.body.style.paddingRight = '0px'; -- document.body.style.overflow = 'auto'; -- -- // Wait a bit for any animations to complete -- await new Promise(resolve => setTimeout(resolve, 100)); -- } -- """ -- -- try: -- await page.evaluate(remove_overlays_js) -- await page.wait_for_timeout(500) # Wait for any animations to complete -- except Exception as e: -- self.logger.warning( -- message="Failed to remove overlay elements: {error}", -- tag="SCRAPE", -- params={"error": str(e)} -- ) -- # if self.verbose: -- # print(f"Warning: Failed to remove overlay elements: {str(e)}") -- -- async def take_screenshot(self, page: Page) -> str: -- """ -- Takes a 
screenshot of the current page. -- -- Args: -- page (Page): The Playwright page instance -- -- Returns: -- str: Base64-encoded screenshot image -- """ -- try: -- # The page is already loaded, just take the screenshot -- screenshot = await page.screenshot(full_page=True) -- return base64.b64encode(screenshot).decode('utf-8') -- except Exception as e: -- error_message = f"Failed to take screenshot: {str(e)}" -- self.logger.error( -- message="Screenshot failed: {error}", -- tag="ERROR", -- params={"error": error_message} -- ) -- -- -- # Generate an error image -- img = Image.new('RGB', (800, 600), color='black') -- draw = ImageDraw.Draw(img) -- font = ImageFont.load_default() -- draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) -- -- buffered = BytesIO() -- img.save(buffered, format="JPEG") -- return base64.b64encode(buffered.getvalue()).decode('utf-8') -- finally: -- await page.close() -- -- async def export_storage_state(self, path: str = None) -> dict: -- """ -- Exports the current storage state (cookies, localStorage, sessionStorage) -- to a JSON file at the specified path. -- """ -- if self.default_context: -- state = await self.default_context.storage_state(path=path) -- self.logger.info( -- message="Exported storage state to {path}", -- tag="INFO", -- params={"path": path} -- ) -- return state -- else: -- self.logger.warning( -- message="No default_context available to export storage state.", -- tag="WARNING" -- ) -- -- async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: -- """ -- Generates a screenshot from raw HTML content. -- -- Args: -- html (str): The HTML content to render and capture. -- -- Returns: -- Optional[str]: Base64-encoded screenshot image or an error image if failed. 
-- """ -- try: -- if not self.browser: -- await self.start() -- page = await self.browser.new_page() -- await page.set_content(html, wait_until='networkidle') -- screenshot = await page.screenshot(full_page=True) -- await page.close() -- return base64.b64encode(screenshot).decode('utf-8') -- except Exception as e: -- error_message = f"Failed to take screenshot: {str(e)}" -- # print(error_message) -- self.logger.error( -- message="Screenshot failed: {error}", -- tag="ERROR", -- params={"error": error_message} -- ) -- -- # Generate an error image -- img = Image.new('RGB', (800, 600), color='black') -- draw = ImageDraw.Draw(img) -- font = ImageFont.load_default() -- draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) -- -- buffered = BytesIO() -- img.save(buffered, format="JPEG") -- return base64.b64encode(buffered.getvalue()).decode('utf-8') -- -diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py -index 553e9df..3f040e1 100644 ---- a/crawl4ai/async_crawler_strategy.py -+++ b/crawl4ai/async_crawler_strategy.py -@@ -17,9 +17,10 @@ import json - import uuid - from .js_snippet import load_js_script - from .models import AsyncCrawlResponse --from .utils import create_box_message -+from .utils import get_error_context - from .user_agent_generator import UserAgentGenerator --from .config import SCREENSHOT_HEIGHT_TRESHOLD -+from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT -+from .async_configs import BrowserConfig, CrawlerRunConfig - from playwright_stealth import StealthConfig, stealth_async - - -@@ -64,7 +65,6 @@ BROWSER_DISABLE_OPTIONS = [ - "--use-mock-keychain" - ] - -- - class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): - self.browser_type = browser_type -@@ -225,50 +225,44 @@ class ManagedBrowser: - params={"error": str(e)} - ) - -- - class 
BrowserManager: -- def __init__(self, use_managed_browser: bool, user_data_dir: Optional[str], headless: bool, logger, browser_type: str, proxy, proxy_config, chrome_channel: str, viewport_width: int, viewport_height: int, accept_downloads: bool, storage_state, ignore_https_errors: bool, java_script_enabled: bool, cookies: List[dict], headers: dict, extra_args: List[str], text_only: bool, light_mode: bool, user_agent: str, browser_hint: str, downloads_path: Optional[str]): -- self.use_managed_browser = use_managed_browser -- self.user_data_dir = user_data_dir -- self.headless = headless -+ def __init__(self, browser_config: BrowserConfig, logger=None): -+ """ -+ Initialize the BrowserManager with a browser configuration. -+ -+ Args: -+ browser_config (BrowserConfig): Configuration object containing all browser settings -+ logger: Logger instance for recording events and errors -+ """ -+ self.config = browser_config - self.logger = logger -- self.browser_type = browser_type -- self.proxy = proxy -- self.proxy_config = proxy_config -- self.chrome_channel = chrome_channel -- self.viewport_width = viewport_width -- self.viewport_height = viewport_height -- self.accept_downloads = accept_downloads -- self.storage_state = storage_state -- self.ignore_https_errors = ignore_https_errors -- self.java_script_enabled = java_script_enabled -- self.cookies = cookies or [] -- self.headers = headers or {} -- self.extra_args = extra_args or [] -- self.text_only = text_only -- self.light_mode = light_mode -+ -+ # Browser state - self.browser = None -- self.default_context : BrowserContext = None -+ self.default_context = None - self.managed_browser = None -- self.sessions = {} -- self.session_ttl = 1800 - self.playwright = None -- self.user_agent = user_agent -- self.browser_hint = browser_hint -- self.downloads_path = downloads_path -+ -+ # Session management -+ self.sessions = {} -+ self.session_ttl = 1800 # 30 minutes -+ -+ # Initialize ManagedBrowser if needed -+ if 
self.config.use_managed_browser: -+ self.managed_browser = ManagedBrowser( -+ browser_type=self.config.browser_type, -+ user_data_dir=self.config.user_data_dir, -+ headless=self.config.headless, -+ logger=self.logger -+ ) - - async def start(self): -+ """Start the browser instance and set up the default context.""" - if self.playwright is None: - from playwright.async_api import async_playwright - self.playwright = await async_playwright().start() - -- if self.use_managed_browser: -- self.managed_browser = ManagedBrowser( -- browser_type=self.browser_type, -- user_data_dir=self.user_data_dir, -- headless=self.headless, -- logger=self.logger -- ) -+ if self.config.use_managed_browser: - cdp_url = await self.managed_browser.start() - self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) - contexts = self.browser.contexts -@@ -276,142 +270,126 @@ class BrowserManager: - self.default_context = contexts[0] - else: - self.default_context = await self.browser.new_context( -- viewport={"width": self.viewport_width, "height": self.viewport_height}, -- storage_state=self.storage_state, -- user_agent=self.headers.get("User-Agent"), -- accept_downloads=self.accept_downloads, -- ignore_https_errors=self.ignore_https_errors, -- java_script_enabled=self.java_script_enabled -+ viewport={"width": self.config.viewport_width, "height": self.config.viewport_height}, -+ storage_state=self.config.storage_state, -+ user_agent=self.config.headers.get("User-Agent", self.config.user_agent), -+ accept_downloads=self.config.accept_downloads, -+ ignore_https_errors=self.config.ignore_https_errors, -+ java_script_enabled=self.config.java_script_enabled - ) - await self.setup_context(self.default_context) - else: -- browser_args = { -- "headless": self.headless, -- "args": [ -- "--no-sandbox", -- "--disable-dev-shm-usage", -- "--no-first-run", -- "--no-default-browser-check", -- "--disable-infobars", -- "--window-position=0,0", -- "--ignore-certificate-errors", -- 
"--ignore-certificate-errors-spki-list", -- "--disable-blink-features=AutomationControlled", -- "--window-position=400,0", -- f"--window-size={self.viewport_width},{self.viewport_height}", -- ] -- } -- -- if self.light_mode: -- browser_args["args"].extend(BROWSER_DISABLE_OPTIONS) -+ browser_args = self._build_browser_args() -+ -+ # Launch appropriate browser type -+ if self.config.browser_type == "firefox": -+ self.browser = await self.playwright.firefox.launch(**browser_args) -+ elif self.config.browser_type == "webkit": -+ self.browser = await self.playwright.webkit.launch(**browser_args) -+ else: -+ self.browser = await self.playwright.chromium.launch(**browser_args) - -- if self.text_only: -- browser_args["args"].extend(['--blink-settings=imagesEnabled=false','--disable-remote-fonts']) -+ self.default_context = self.browser - -- if self.chrome_channel: -- browser_args["channel"] = self.chrome_channel -+ def _build_browser_args(self) -> dict: -+ """Build browser launch arguments from config.""" -+ args = [ -+ "--no-sandbox", -+ "--disable-dev-shm-usage", -+ "--no-first-run", -+ "--no-default-browser-check", -+ "--disable-infobars", -+ "--window-position=0,0", -+ "--ignore-certificate-errors", -+ "--ignore-certificate-errors-spki-list", -+ "--disable-blink-features=AutomationControlled", -+ "--window-position=400,0", -+ f"--window-size={self.config.viewport_width},{self.config.viewport_height}", -+ ] - -- if self.extra_args: -- browser_args["args"].extend(self.extra_args) -+ if self.config.light_mode: -+ args.extend(BROWSER_DISABLE_OPTIONS) - -- if self.accept_downloads: -- browser_args["downloads_path"] = os.path.join(os.getcwd(), "downloads") -- os.makedirs(browser_args["downloads_path"], exist_ok=True) -+ if self.config.text_only: -+ args.extend(['--blink-settings=imagesEnabled=false', '--disable-remote-fonts']) - -- if self.proxy: -- from playwright.async_api import ProxySettings -- proxy_settings = ProxySettings(server=self.proxy) -- browser_args["proxy"] = 
proxy_settings -- elif self.proxy_config: -- from playwright.async_api import ProxySettings -- proxy_settings = ProxySettings( -- server=self.proxy_config.get("server"), -- username=self.proxy_config.get("username"), -- password=self.proxy_config.get("password") -- ) -- browser_args["proxy"] = proxy_settings -+ if self.config.extra_args: -+ args.extend(self.config.extra_args) - -- if self.browser_type == "firefox": -- self.browser = await self.playwright.firefox.launch(**browser_args) -- elif self.browser_type == "webkit": -- self.browser = await self.playwright.webkit.launch(**browser_args) -- else: -- self.browser = await self.playwright.chromium.launch(**browser_args) -+ browser_args = { -+ "headless": self.config.headless, -+ "args": args -+ } - -- self.default_context = self.browser -- # Since default_context in non-managed mode is the browser, no setup needed here. -+ if self.config.chrome_channel: -+ browser_args["channel"] = self.config.chrome_channel -+ -+ if self.config.accept_downloads: -+ browser_args["downloads_path"] = (self.config.downloads_path or -+ os.path.join(os.getcwd(), "downloads")) -+ os.makedirs(browser_args["downloads_path"], exist_ok=True) -+ -+ if self.config.proxy or self.config.proxy_config: -+ from playwright.async_api import ProxySettings -+ proxy_settings = ( -+ ProxySettings(server=self.config.proxy) if self.config.proxy else -+ ProxySettings( -+ server=self.config.proxy_config.get("server"), -+ username=self.config.proxy_config.get("username"), -+ password=self.config.proxy_config.get("password") -+ ) -+ ) -+ browser_args["proxy"] = proxy_settings - -+ return browser_args - -- async def setup_context(self, context : BrowserContext, is_default=False): -- # Set extra headers -- if self.headers: -- await context.set_extra_http_headers(self.headers) -+ async def setup_context(self, context: BrowserContext, is_default=False): -+ """Set up a browser context with the configured options.""" -+ if self.config.headers: -+ await 
context.set_extra_http_headers(self.config.headers) - -- # Add cookies if any -- if self.cookies: -- await context.add_cookies(self.cookies) -+ if self.config.cookies: -+ await context.add_cookies(self.config.cookies) - -- # Ensure storage_state if provided -- if self.storage_state: -- # If storage_state is a dictionary or file path, Playwright will handle it. -+ if self.config.storage_state: - await context.storage_state(path=None) - -- # If accept_downloads, set timeouts and ensure properties -- if self.accept_downloads: -- await context.set_default_timeout(60000) -- await context.set_default_navigation_timeout(60000) -- if self.downloads_path: -+ if self.config.accept_downloads: -+ context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT) -+ context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) -+ if self.config.downloads_path: - context._impl_obj._options["accept_downloads"] = True -- context._impl_obj._options["downloads_path"] = self.downloads_path -+ context._impl_obj._options["downloads_path"] = self.config.downloads_path - -- # If we have a user_agent, override it along with sec-ch-ua -- if self.user_agent: -- # Merge headers if needed -- combined_headers = {"User-Agent": self.user_agent, "sec-ch-ua": self.browser_hint} -- combined_headers.update(self.headers) -+ # Handle user agent and browser hints -+ if self.config.user_agent: -+ combined_headers = { -+ "User-Agent": self.config.user_agent, -+ "sec-ch-ua": self.config.browser_hint -+ } -+ combined_headers.update(self.config.headers) - await context.set_extra_http_headers(combined_headers) -- -- async def close(self): -- # Close all active sessions -- session_ids = list(self.sessions.keys()) -- for session_id in session_ids: -- await self.kill_session(session_id) -- -- if self.browser: -- await self.browser.close() -- self.browser = None -- -- if self.managed_browser: -- await asyncio.sleep(0.5) -- await self.managed_browser.cleanup() -- self.managed_browser = None -- -- if self.playwright: -- await 
self.playwright.stop() -- self.playwright = None - - async def get_page(self, session_id: Optional[str], user_agent: str): -- # Cleanup expired sessions -+ """Get a page for the given session ID, creating a new one if needed.""" - self._cleanup_expired_sessions() - -- if session_id: -- context, page, _ = self.sessions.get(session_id, (None, None, None)) -- if context and page: -- self.sessions[session_id] = (context, page, time.time()) -- return page, context -+ if session_id and session_id in self.sessions: -+ context, page, _ = self.sessions[session_id] -+ self.sessions[session_id] = (context, page, time.time()) -+ return page, context - -- # Create a new context/page pair -- if self.use_managed_browser: -+ if self.config.use_managed_browser: - context = self.default_context - page = await context.new_page() - else: - context = await self.browser.new_context( - user_agent=user_agent, -- viewport={"width": self.viewport_width, "height": self.viewport_height}, -- proxy={"server": self.proxy} if self.proxy else None, -- accept_downloads=self.accept_downloads, -- storage_state=self.storage_state, -- ignore_https_errors=self.ignore_https_errors -+ viewport={"width": self.config.viewport_width, "height": self.config.viewport_height}, -+ proxy={"server": self.config.proxy} if self.config.proxy else None, -+ accept_downloads=self.config.accept_downloads, -+ storage_state=self.config.storage_state, -+ ignore_https_errors=self.config.ignore_https_errors - ) - await self.setup_context(context) - page = await context.new_page() -@@ -422,14 +400,16 @@ class BrowserManager: - return page, context - - async def kill_session(self, session_id: str): -+ """Kill a browser session and clean up resources.""" - if session_id in self.sessions: - context, page, _ = self.sessions[session_id] - await page.close() -- if not self.use_managed_browser: -+ if not self.config.use_managed_browser: - await context.close() - del self.sessions[session_id] - - def _cleanup_expired_sessions(self): -+ 
"""Clean up expired sessions based on TTL.""" - current_time = time.time() - expired_sessions = [ - sid for sid, (_, _, last_used) in self.sessions.items() -@@ -438,6 +418,28 @@ class BrowserManager: - for sid in expired_sessions: - asyncio.create_task(self.kill_session(sid)) - -+ async def close(self): -+ """Close all browser resources and clean up.""" -+ if self.config.sleep_on_close: -+ await asyncio.sleep(0.5) -+ -+ session_ids = list(self.sessions.keys()) -+ for session_id in session_ids: -+ await self.kill_session(session_id) -+ -+ if self.browser: -+ await self.browser.close() -+ self.browser = None -+ -+ if self.managed_browser: -+ await asyncio.sleep(0.5) -+ await self.managed_browser.cleanup() -+ self.managed_browser = None -+ -+ if self.playwright: -+ await self.playwright.stop() -+ self.playwright = None -+ - class AsyncCrawlerStrategy(ABC): - @abstractmethod - async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: -@@ -460,60 +462,24 @@ class AsyncCrawlerStrategy(ABC): - pass - - class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): -- def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): -- self.text_only = kwargs.get("text_only", False) -- self.light_mode = kwargs.get("light_mode", False) -+ def __init__(self, browser_config: BrowserConfig = None, logger = None, **kwargs): -+ """ -+ Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. -+ -+ Args: -+ browser_config (BrowserConfig): Configuration object containing browser settings. -+ If None, will be created from kwargs for backwards compatibility. -+ logger: Logger instance for recording events and errors. -+ **kwargs: Additional arguments for backwards compatibility and extending functionality. 
-+ """ -+ # Initialize browser config, either from provided object or kwargs -+ self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs) - self.logger = logger -- self.use_cached_html = use_cached_html -- self.viewport_width = kwargs.get("viewport_width", 800 if self.text_only else 1920) -- self.viewport_height = kwargs.get("viewport_height", 600 if self.text_only else 1080) - -- if self.text_only: -- self.extra_args = kwargs.get("extra_args", []) + [ -- '--disable-images', -- '--disable-javascript', -- '--disable-gpu', -- '--disable-software-rasterizer', -- '--disable-dev-shm-usage' -- ] -- -- self.user_agent = kwargs.get( -- "user_agent", -- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" -- # "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" -- ) -- user_agenr_generator = UserAgentGenerator() -- if kwargs.get("user_agent_mode") == "random": -- self.user_agent = user_agenr_generator.generate( -- **kwargs.get("user_agent_generator_config", {}) -- ) -- self.pdf = kwargs.get("pdf", False) # New flag -- self.screenshot_requested = kwargs.get('screenshot', False) -+ # Initialize session management -+ self._downloaded_files = [] - -- self.proxy = kwargs.get("proxy") -- self.proxy_config = kwargs.get("proxy_config") -- self.headless = kwargs.get("headless", True) -- self.browser_type = kwargs.get("browser_type", "chromium") -- self.headers = kwargs.get("headers", {}) -- self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) -- self.headers.setdefault("sec-ch-ua", self.browser_hint) -- self.cookies = kwargs.get("cookies", []) -- self.storage_state = kwargs.get("storage_state", None) -- self.sessions = {} -- self.session_ttl = 1800 -- self.js_code = js_code -- self.verbose = kwargs.get("verbose", False) -- self.playwright = None -- self.browser = None -- 
self.sleep_on_close = kwargs.get("sleep_on_close", False) -- self.use_managed_browser = kwargs.get("use_managed_browser", False) -- self.user_data_dir = kwargs.get("user_data_dir", None) -- self.use_persistent_context = kwargs.get("use_persistent_context", False) -- if self.use_persistent_context: -- self.use_managed_browser = True -- self.chrome_channel = kwargs.get("chrome_channel", "chrome") -- self.managed_browser = None -- self.default_context = None -+ # Initialize hooks system - self.hooks = { - 'on_browser_created': None, - 'on_user_agent_updated': None, -@@ -523,40 +489,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - 'before_return_html': None, - 'before_retrieve_html': None - } -- self.extra_args = kwargs.get("extra_args", []) -- self.ignore_https_errors = kwargs.get("ignore_https_errors", True) -- self.java_script_enabled = kwargs.get("java_script_enabled", True) -- self.accept_downloads = kwargs.get("accept_downloads", False) -- self.downloads_path = kwargs.get("downloads_path") -- self._downloaded_files = [] # Track downloaded files for current crawl -- if self.accept_downloads and not self.downloads_path: -- self.downloads_path = os.path.join(os.getcwd(), "downloads") -- os.makedirs(self.downloads_path, exist_ok=True) -- -+ -+ # Initialize browser manager with config - self.browser_manager = BrowserManager( -- use_managed_browser=self.use_managed_browser, -- user_data_dir=self.user_data_dir, -- headless=self.headless, -- logger=self.logger, -- browser_type=self.browser_type, -- proxy=self.proxy, -- proxy_config=self.proxy_config, -- chrome_channel=self.chrome_channel, -- viewport_width=self.viewport_width, -- viewport_height=self.viewport_height, -- accept_downloads=self.accept_downloads, -- storage_state=self.storage_state, -- ignore_https_errors=self.ignore_https_errors, -- java_script_enabled=self.java_script_enabled, -- cookies=self.cookies, -- headers=self.headers, -- extra_args=self.extra_args, -- text_only=self.text_only, -- 
light_mode=self.light_mode, -- user_agent=self.user_agent, -- browser_hint=self.browser_hint, -- downloads_path=self.downloads_path -- ) -+ browser_config=self.browser_config, -+ logger=self.logger -+ ) - - async def __aenter__(self): - await self.start() -@@ -570,15 +508,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - await self.execute_hook('on_browser_created', self.browser_manager.browser, context = self.browser_manager.default_context) - - async def close(self): -- if self.sleep_on_close: -- await asyncio.sleep(0.5) -- - await self.browser_manager.close() -- -- # Issue #256: Remove __del__ method to avoid potential issues with async cleanup -- # def __del__(self): -- # if self.browser or self.playwright: -- # asyncio.get_event_loop().run_until_complete(self.close()) -+ -+ async def kill_session(self, session_id: str): -+ # Log a warning message and no need kill session, in new version auto kill session -+ self.logger.warning( -+ message="Session auto-kill is enabled in the new version. 
No need to manually kill sessions.", -+ tag="WARNING" -+ ) -+ await self.browser_manager.kill_session(session_id) - - def set_hook(self, hook_type: str, hook: Callable): - if hook_type in self.hooks: -@@ -600,23 +538,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - - def set_custom_headers(self, headers: Dict[str, str]): - self.headers = headers -- -- async def kill_session(self, session_id: str): -- if session_id in self.sessions: -- context, page, _ = self.sessions[session_id] -- await page.close() -- if not self.use_managed_browser: -- await context.close() -- del self.sessions[session_id] -- -- def _cleanup_expired_sessions(self): -- current_time = time.time() -- expired_sessions = [ -- sid for sid, (_, _, last_used) in self.sessions.items() -- if current_time - last_used > self.session_ttl -- ] -- for sid in expired_sessions: -- asyncio.create_task(self.kill_session(sid)) - - async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): - wait_for = wait_for.strip() -@@ -715,7 +636,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - }} - """) - else: -- # print(f"Warning: Could not access content frame for iframe {i}") - self.logger.warning( - message="Could not access content frame for iframe {index}", - tag="SCRAPE", -@@ -727,7 +647,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - tag="ERROR", - params={"index": i, "error": str(e)} - ) -- # print(f"Error processing iframe {i}: {str(e)}") - - # Return the page object - return page -@@ -743,7 +662,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - page, context = await self.browser_manager.get_page(session_id, user_agent) - return session_id - -- async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: -+ async def crawl(self, url: str, config: CrawlerRunConfig, **kwargs) -> AsyncCrawlResponse: - """ - Crawls a given URL or processes raw HTML/local file content based on the URL prefix. 
- -@@ -759,15 +678,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - Returns: - AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. - """ -+ config = config or CrawlerRunConfig.from_kwargs(kwargs) - response_headers = {} -- status_code = 200 # Default to 200 for local/raw HTML -- screenshot_requested = kwargs.get("screenshot", self.screenshot_requested) -- pdf_requested = kwargs.get("pdf", self.pdf) -+ status_code = 200 # Default for local/raw HTML - screenshot_data = None - - if url.startswith(('http://', 'https://')): -- # Proceed with standard web crawling -- return await self._crawl_web(url, **kwargs) -+ return await self._crawl_web(url, config) - - elif url.startswith('file://'): - # Process local file -@@ -776,7 +693,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - raise FileNotFoundError(f"Local file not found: {local_file_path}") - with open(local_file_path, 'r', encoding='utf-8') as f: - html = f.read() -- if screenshot_requested: -+ if config.screenshot: - screenshot_data = await self._generate_screenshot_from_html(html) - return AsyncCrawlResponse( - html=html, -@@ -790,7 +707,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - # Process raw HTML content - raw_html = url[4:] # Remove 'raw:' prefix - html = raw_html -- if screenshot_requested: -+ if config.screenshot: - screenshot_data = await self._generate_screenshot_from_html(html) - return AsyncCrawlResponse( - html=html, -@@ -802,92 +719,85 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - else: - raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") - -- async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: -+ async def _crawl_web(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse: -+ """ -+ Internal method to crawl web URLs with the specified configuration. 
-+ -+ Args: -+ url (str): The web URL to crawl -+ config (CrawlerRunConfig): Configuration object controlling the crawl behavior -+ -+ Returns: -+ AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data -+ """ - response_headers = {} - status_code = None - -- screenshot_requested = kwargs.get("screenshot", self.screenshot_requested) -- pdf_requested = kwargs.get("pdf", self.pdf) -- - # Reset downloaded files list for new crawl - self._downloaded_files = [] - -- self._cleanup_expired_sessions() -- session_id = kwargs.get("session_id") -- -- # Check if in kwargs we have user_agent that will override the default user_agent -- user_agent = kwargs.get("user_agent", self.user_agent) -- -- # Generate random user agent if magic mode is enabled and user_agent_mode is not random -- if kwargs.get("user_agent_mode") != "random" and kwargs.get("magic", False): -+ # Handle user agent with magic mode -+ user_agent = self.browser_config.user_agent -+ if config.magic and self.browser_config.user_agent_mode != "random": - user_agent = UserAgentGenerator().generate( -- **kwargs.get("user_agent_generator_config", {}) -+ **(self.browser_config.user_agent_generator_config or {}) - ) - -- # Handle page creation differently for managed browser -- page, context = await self.browser_manager.get_page(session_id, user_agent) -+ # Get page for session -+ page, context = await self.browser_manager.get_page( -+ session_id=config.session_id, -+ user_agent=user_agent -+ ) -+ -+ # Add default cookie - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) - -- if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): -- # Inject scripts to override navigator properties -+ # Handle navigator overrides -+ if config.override_navigator or config.simulate_user or config.magic: - await context.add_init_script(load_js_script("navigator_overrider")) - -- # Add console message and error 
logging -- if kwargs.get("log_console", False): -- page.on("console", lambda msg: print(f"Console: {msg.text}")) -- page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) -+ # Set up console logging if requested -+ if config.log_console: -+ page.on("console", lambda msg: self.logger.debug( -+ message="Console: {msg}", -+ tag="CONSOLE", -+ params={"msg": msg.text} -+ )) -+ page.on("pageerror", lambda exc: self.logger.error( -+ message="Page error: {exc}", -+ tag="ERROR", -+ params={"exc": exc} -+ )) - - try: -- # Set up download handling if enabled -- if self.accept_downloads: -+ # Set up download handling -+ if self.browser_config.accept_downloads: - page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) - -- if self.use_cached_html: -- cache_file_path = os.path.join( -- os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() -- ) -- if os.path.exists(cache_file_path): -- html = "" -- with open(cache_file_path, "r") as f: -- html = f.read() -- # retrieve response headers and status code from cache -- with open(cache_file_path + ".meta", "r") as f: -- meta = json.load(f) -- response_headers = meta.get("response_headers", {}) -- status_code = meta.get("status_code") -- response = AsyncCrawlResponse( -- html=html, response_headers=response_headers, status_code=status_code -- ) -- return response -- -- if not kwargs.get("js_only", False): -- await self.execute_hook('before_goto', page, context = context, **kwargs) -+ # Handle page navigation and content loading -+ if not config.js_only: -+ await self.execute_hook('before_goto', page, context=context) - - try: - response = await page.goto( - url, -- # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), -- wait_until=kwargs.get("wait_until", "domcontentloaded"), -- timeout=kwargs.get("page_timeout", 60000), -+ wait_until=config.wait_until, -+ timeout=config.page_timeout - ) - except Error as e: -- 
raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") -+ raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") - -- await self.execute_hook('after_goto', page, context = context, **kwargs) -+ await self.execute_hook('after_goto', page, context=context) - -- # Get status code and headers - status_code = response.status - response_headers = response.headers - else: - status_code = 200 - response_headers = {} - -- # Replace the current wait_for_selector line with this more robust check: -+ # Wait for body element and visibility - try: -- # First wait for body to exist, regardless of visibility - await page.wait_for_selector('body', state='attached', timeout=30000) -- -- # Then wait for it to become visible by checking CSS - await page.wait_for_function(""" - () => { - const body = document.body; -@@ -897,9 +807,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - style.opacity !== '0'; - } - """, timeout=30000) -- - except Error as e: -- # If waiting fails, let's try to diagnose the issue - visibility_info = await page.evaluate(""" - () => { - const body = document.body; -@@ -914,233 +822,195 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - } - """) - -- if self.verbose: -- print(f"Body visibility debug info: {visibility_info}") -+ if self.config.verbose: -+ self.logger.debug( -+ message="Body visibility info: {info}", -+ tag="DEBUG", -+ params={"info": visibility_info} -+ ) - -- # Even if body is hidden, we might still want to proceed -- if kwargs.get('ignore_body_visibility', True): -- if self.verbose: -- print("Proceeding despite hidden body...") -- pass -- else: -+ if not config.ignore_body_visibility: - raise Error(f"Body element is hidden: {visibility_info}") -- -- # CONTENT LOADING ASSURANCE -- if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): -- # Wait for network idle after initial load and images to load -- # await 
page.wait_for_load_state("networkidle") -+ -+ # Handle content loading and viewport adjustment -+ if not self.browser_config.text_only and (config.wait_for_images or config.adjust_viewport_to_content): - await page.wait_for_load_state("domcontentloaded") - await asyncio.sleep(0.1) -- from playwright.async_api import TimeoutError as PlaywrightTimeoutError - try: -- await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) -- # Check for TimeoutError and ignore it -+ await page.wait_for_function( -+ "Array.from(document.images).every(img => img.complete)", -+ timeout=1000 -+ ) - except PlaywrightTimeoutError: - pass -- -- # After initial load, adjust viewport to content size -- if not self.text_only and kwargs.get("adjust_viewport_to_content", False): -- try: -- # Get actual page dimensions -+ -+ # Adjust viewport if needed -+ if not self.browser_config.text_only and config.adjust_viewport_to_content: -+ try: - page_width = await page.evaluate("document.documentElement.scrollWidth") - page_height = await page.evaluate("document.documentElement.scrollHeight") - -- target_width = self.viewport_width -+ target_width = self.browser_config.viewport_width - target_height = int(target_width * page_width / page_height * 0.95) - await page.set_viewport_size({"width": target_width, "height": target_height}) - -- # Compute scale factor -- # We want the entire page visible: the scale should make both width and height fit - scale = min(target_width / page_width, target_height / page_height) -- -- # Now we call CDP to set metrics. -- # We tell Chrome that the "device" is page_width x page_height in size, -- # but we scale it down so everything fits within the real viewport. 
- cdp = await page.context.new_cdp_session(page) - await cdp.send('Emulation.setDeviceMetricsOverride', { -- 'width': page_width, # full page width -- 'height': page_height, # full page height -- 'deviceScaleFactor': 1, # keep normal DPR -+ 'width': page_width, -+ 'height': page_height, -+ 'deviceScaleFactor': 1, - 'mobile': False, -- 'scale': scale # scale the entire rendered content -+ 'scale': scale - }) -- - except Exception as e: - self.logger.warning( - message="Failed to adjust viewport to content: {error}", - tag="VIEWPORT", - params={"error": str(e)} -- ) -- -- # After viewport adjustment, handle page scanning if requested -- if kwargs.get("scan_full_page", False): -- try: -- viewport_height = page.viewport_size.get("height", self.viewport_height) -- current_position = viewport_height # Start with one viewport height -- scroll_delay = kwargs.get("scroll_delay", 0.2) -- -- # Initial scroll -- await page.evaluate(f"window.scrollTo(0, {current_position})") -- await asyncio.sleep(scroll_delay) -- -- # Get height after first scroll to account for any dynamic content -- total_height = await page.evaluate("document.documentElement.scrollHeight") -- -- while current_position < total_height: -- current_position = min(current_position + viewport_height, total_height) -- await page.evaluate(f"window.scrollTo(0, {current_position})") -- await asyncio.sleep(scroll_delay) -- -- # Check for dynamic content -- new_height = await page.evaluate("document.documentElement.scrollHeight") -- if new_height > total_height: -- total_height = new_height -- -- # Scroll back to top -- await page.evaluate("window.scrollTo(0, 0)") -- -- except Exception as e: -- self.logger.warning( -- message="Failed to perform full page scan: {error}", -- tag="PAGE_SCAN", -- params={"error": str(e)} - ) -- else: -- # Scroll to the bottom of the page -- await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") -- -- js_code = kwargs.get("js_code", kwargs.get("js", self.js_code)) -- if 
js_code: -- if isinstance(js_code, str): -- await page.evaluate(js_code) -- elif isinstance(js_code, list): -- for js in js_code: -+ -+ # Handle full page scanning -+ if config.scan_full_page: -+ await self._handle_full_page_scan(page, config.scroll_delay) -+ -+ # Execute JavaScript if provided -+ if config.js_code: -+ if isinstance(config.js_code, str): -+ await page.evaluate(config.js_code) -+ elif isinstance(config.js_code, list): -+ for js in config.js_code: - await page.evaluate(js) - -- # await page.wait_for_timeout(100) -- -- # Check for on execution event -- await self.execute_hook('on_execution_started', page, context = context, **kwargs) -- -- if kwargs.get("simulate_user", False) or kwargs.get("magic", False): -- # Simulate user interactions -+ await self.execute_hook('on_execution_started', page, context=context) -+ -+ # Handle user simulation -+ if config.simulate_user or config.magic: - await page.mouse.move(100, 100) - await page.mouse.down() - await page.mouse.up() - await page.keyboard.press('ArrowDown') - -- # Handle the wait_for parameter -- wait_for = kwargs.get("wait_for") -- if wait_for: -+ # Handle wait_for condition -+ if config.wait_for: - try: -- await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) -+ await self.smart_wait(page, config.wait_for, timeout=config.page_timeout) - except Exception as e: - raise RuntimeError(f"Wait condition failed: {str(e)}") -- -- # if not wait_for and js_code: -- # await page.wait_for_load_state('networkidle', timeout=5000) - -- # Update image dimensions -- if not self.text_only: -+ # Update image dimensions if needed -+ if not self.browser_config.text_only: - update_image_dimensions_js = load_js_script("update_image_dimensions") -- - try: - try: -- await page.wait_for_load_state( -- # state="load", -- state="domcontentloaded", -- timeout=5 -- ) -+ await page.wait_for_load_state("domcontentloaded", timeout=5) - except PlaywrightTimeoutError: - pass - await 
page.evaluate(update_image_dimensions_js) - except Exception as e: - self.logger.error( -- message="Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {error}", -+ message="Error updating image dimensions: {error}", - tag="ERROR", - params={"error": str(e)} - ) -- # raise RuntimeError(f"Error updating image dimensions ACS-UPDATE_IMAGE_DIMENSIONS_JS: {str(e)}") - -- # Wait a bit for any onload events to complete -- # await page.wait_for_timeout(100) -- -- # Process iframes -- if kwargs.get("process_iframes", False): -+ # Process iframes if needed -+ if config.process_iframes: - page = await self.process_iframes(page) -- -- await self.execute_hook('before_retrieve_html', page, context = context, **kwargs) -- # Check if delay_before_return_html is set then wait for that time -- delay_before_return_html = kwargs.get("delay_before_return_html", 0.1) -- if delay_before_return_html: -- await asyncio.sleep(delay_before_return_html) -- -- # Check for remove_overlay_elements parameter -- if kwargs.get("remove_overlay_elements", False): -+ -+ # Pre-content retrieval hooks and delay -+ await self.execute_hook('before_retrieve_html', page, context=context) -+ if config.delay_before_return_html: -+ await asyncio.sleep(config.delay_before_return_html) -+ -+ # Handle overlay removal -+ if config.remove_overlay_elements: - await self.remove_overlay_elements(page) -- -+ -+ # Get final HTML content - html = await page.content() -- await self.execute_hook('before_return_html', page, html, context = context, **kwargs) -- -+ await self.execute_hook('before_return_html', page, html, context=context) -+ -+ # Handle PDF and screenshot generation - start_export_time = time.perf_counter() - pdf_data = None -- if pdf_requested: -- # Generate PDF once -- pdf_data = await self.export_pdf(page) -- -- # Check if kwargs has screenshot=True then take screenshot - screenshot_data = None -- if screenshot_requested: #kwargs.get("screenshot"): -- # Check we have screenshot_wait_for 
parameter, if we have simply wait for that time -- screenshot_wait_for = kwargs.get("screenshot_wait_for") -- if screenshot_wait_for: -- await asyncio.sleep(screenshot_wait_for) -- -- screenshot_data = await self.take_screenshot(page, **kwargs) -- end_export_time = time.perf_counter() -+ -+ if config.pdf: -+ pdf_data = await self.export_pdf(page) -+ -+ if config.screenshot: -+ if config.screenshot_wait_for: -+ await asyncio.sleep(config.screenshot_wait_for) -+ screenshot_data = await self.take_screenshot( -+ page, -+ screenshot_height_threshold=config.screenshot_height_threshold -+ ) -+ - if screenshot_data or pdf_data: - self.logger.info( - message="Exporting PDF and taking screenshot took {duration:.2f}s", - tag="EXPORT", -- params={"duration": end_export_time - start_export_time} -+ params={"duration": time.perf_counter() - start_export_time} - ) -- -- if self.use_cached_html: -- cache_file_path = os.path.join( -- os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() -- ) -- with open(cache_file_path, "w", encoding="utf-8") as f: -- f.write(html) -- # store response headers and status code in cache -- with open(cache_file_path + ".meta", "w", encoding="utf-8") as f: -- json.dump({ -- "response_headers": response_headers, -- "status_code": status_code -- }, f) - -+ # Define delayed content getter - async def get_delayed_content(delay: float = 5.0) -> str: -- if self.verbose: -- print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") -+ if self.config.verbose: -+ self.logger.info( -+ message="Waiting for {delay} seconds before retrieving content for {url}", -+ tag="INFO", -+ params={"delay": delay, "url": url} -+ ) - await asyncio.sleep(delay) - return await page.content() -- -- response = AsyncCrawlResponse( -- html=html, -- response_headers=response_headers, -+ -+ # Return complete response -+ return AsyncCrawlResponse( -+ html=html, -+ response_headers=response_headers, - 
status_code=status_code, - screenshot=screenshot_data, - pdf_data=pdf_data, - get_delayed_content=get_delayed_content, - downloaded_files=self._downloaded_files if self._downloaded_files else None - ) -- return response -- except Error as e: -- raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") -- # finally: -- # if not session_id: -- # await page.close() -- # await context.close() - -+ except Exception as e: -+ raise e -+ -+ async def _handle_full_page_scan(self, page: Page, scroll_delay: float): -+ """Helper method to handle full page scanning""" -+ try: -+ viewport_height = page.viewport_size.get("height", self.browser_config.viewport_height) -+ current_position = viewport_height -+ -+ await page.evaluate(f"window.scrollTo(0, {current_position})") -+ await asyncio.sleep(scroll_delay) -+ -+ total_height = await page.evaluate("document.documentElement.scrollHeight") -+ -+ while current_position < total_height: -+ current_position = min(current_position + viewport_height, total_height) -+ await page.evaluate(f"window.scrollTo(0, {current_position})") -+ await asyncio.sleep(scroll_delay) -+ -+ new_height = await page.evaluate("document.documentElement.scrollHeight") -+ if new_height > total_height: -+ total_height = new_height -+ -+ await page.evaluate("window.scrollTo(0, 0)") -+ -+ except Exception as e: -+ self.logger.warning( -+ message="Failed to perform full page scan: {error}", -+ tag="PAGE_SCAN", -+ params={"error": str(e)} -+ ) -+ else: -+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") -+ -+ - async def _handle_download(self, download): - """Handle file downloads.""" - try: -@@ -1170,8 +1040,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - params={"error": str(e)} - ) - -- # if self.verbose: -- # print(f"[ERROR] Failed to handle download: {str(e)}") - - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed 
-@@ -1192,7 +1060,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - Args: - page (Page): The Playwright page instance - """ -- remove_overlays_js = load_js_script("remove_overlays") -+ remove_overlays_js = load_js_script("remove_overlay_elements") - - try: - await page.evaluate(remove_overlays_js) -@@ -1203,8 +1071,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - tag="SCRAPE", - params={"error": str(e)} - ) -- # if self.verbose: -- # print(f"Warning: Failed to remove overlay elements: {str(e)}") - - async def export_pdf(self, page: Page) -> bytes: - """ -@@ -1386,7 +1252,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - return base64.b64encode(screenshot).decode('utf-8') - except Exception as e: - error_message = f"Failed to take screenshot: {str(e)}" -- # print(error_message) - self.logger.error( - message="Screenshot failed: {error}", - tag="ERROR", -diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py -index 3c97e7d..5cdafac 100644 ---- a/crawl4ai/async_database.py -+++ b/crawl4ai/async_database.py -@@ -1,4 +1,4 @@ --import os -+import os, sys - from pathlib import Path - import aiosqlite - import asyncio -@@ -13,6 +13,7 @@ import aiofiles - from .config import NEED_MIGRATION - from .version_manager import VersionManager - from .async_logger import AsyncLogger -+from .utils import get_error_context, create_box_message - # Set up logging - logging.basicConfig(level=logging.INFO) - logger = logging.getLogger(__name__) -@@ -97,35 +98,84 @@ class AsyncDatabaseManager: - - @asynccontextmanager - async def get_connection(self): -- """Connection pool manager""" -+ """Connection pool manager with enhanced error handling""" - if not self._initialized: -- # Use an asyncio.Lock to ensure only one initialization occurs - async with self.init_lock: - if not self._initialized: -- await self.initialize() -- self._initialized = True -+ try: -+ await self.initialize() -+ self._initialized = True -+ except 
Exception as e: -+ import sys -+ error_context = get_error_context(sys.exc_info()) -+ self.logger.error( -+ message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}", -+ tag="ERROR", -+ force_verbose=True, -+ params={ -+ "error": str(e), -+ "context": error_context["code_context"], -+ "traceback": error_context["full_traceback"] -+ } -+ ) -+ raise - - await self.connection_semaphore.acquire() - task_id = id(asyncio.current_task()) -+ - try: - async with self.pool_lock: - if task_id not in self.connection_pool: -- conn = await aiosqlite.connect( -- self.db_path, -- timeout=30.0 -- ) -- await conn.execute('PRAGMA journal_mode = WAL') -- await conn.execute('PRAGMA busy_timeout = 5000') -- self.connection_pool[task_id] = conn -+ try: -+ conn = await aiosqlite.connect( -+ self.db_path, -+ timeout=30.0 -+ ) -+ await conn.execute('PRAGMA journal_mode = WAL') -+ await conn.execute('PRAGMA busy_timeout = 5000') -+ -+ # Verify database structure -+ async with conn.execute("PRAGMA table_info(crawled_data)") as cursor: -+ columns = await cursor.fetchall() -+ column_names = [col[1] for col in columns] -+ expected_columns = { -+ 'url', 'html', 'cleaned_html', 'markdown', 'extracted_content', -+ 'success', 'media', 'links', 'metadata', 'screenshot', -+ 'response_headers', 'downloaded_files' -+ } -+ missing_columns = expected_columns - set(column_names) -+ if missing_columns: -+ raise ValueError(f"Database missing columns: {missing_columns}") -+ -+ self.connection_pool[task_id] = conn -+ except Exception as e: -+ import sys -+ error_context = get_error_context(sys.exc_info()) -+ error_message = ( -+ f"Unexpected error in db get_connection at line {error_context['line_no']} " -+ f"in {error_context['function']} ({error_context['filename']}):\n" -+ f"Error: {str(e)}\n\n" -+ f"Code context:\n{error_context['code_context']}" -+ ) -+ self.logger.error( -+ message=create_box_message(error_message, type= "error"), -+ ) -+ -+ raise - - yield 
self.connection_pool[task_id] - - except Exception as e: -+ import sys -+ error_context = get_error_context(sys.exc_info()) -+ error_message = ( -+ f"Unexpected error in db get_connection at line {error_context['line_no']} " -+ f"in {error_context['function']} ({error_context['filename']}):\n" -+ f"Error: {str(e)}\n\n" -+ f"Code context:\n{error_context['code_context']}" -+ ) - self.logger.error( -- message="Connection error: {error}", -- tag="ERROR", -- force_verbose=True, -- params={"error": str(e)} -+ message=create_box_message(error_message, type= "error"), - ) - raise - finally: -@@ -230,7 +280,8 @@ class AsyncDatabaseManager: - 'cleaned_html': row_dict['cleaned_html'], - 'markdown': row_dict['markdown'], - 'extracted_content': row_dict['extracted_content'], -- 'screenshot': row_dict['screenshot'] -+ 'screenshot': row_dict['screenshot'], -+ 'screenshots': row_dict['screenshot'], - } - - for field, hash_value in content_fields.items(): -diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py -index fc6fe82..72ef0bf 100644 ---- a/crawl4ai/async_webcrawler.py -+++ b/crawl4ai/async_webcrawler.py -@@ -1,4 +1,4 @@ --import os -+import os, sys - import time - import warnings - from enum import Enum -@@ -17,7 +17,7 @@ from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawler - from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode - from .content_scraping_strategy import WebScrapingStrategy - from .async_logger import AsyncLogger -- -+from .async_configs import BrowserConfig, CrawlerRunConfig - from .config import ( - MIN_WORD_THRESHOLD, - IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, -@@ -40,31 +40,20 @@ class AsyncWebCrawler: - """ - Asynchronous web crawler with flexible caching capabilities. 
- -- Migration Guide (from version X.X.X): -+ Migration Guide: - Old way (deprecated): -- crawler = AsyncWebCrawler(always_by_pass_cache=True) -- result = await crawler.arun( -- url="https://example.com", -- bypass_cache=True, -- no_cache_read=True, -- no_cache_write=False -- ) -+ crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True) - - New way (recommended): -- crawler = AsyncWebCrawler(always_bypass_cache=True) -- result = await crawler.arun( -- url="https://example.com", -- cache_mode=CacheMode.WRITE_ONLY -- ) -- -- To disable deprecation warnings: -- Pass warning=False to suppress the warning. -+ browser_config = BrowserConfig(browser_type="chromium", headless=True) -+ crawler = AsyncWebCrawler(browser_config=browser_config) - """ - _domain_last_hit = {} - - def __init__( - self, - crawler_strategy: Optional[AsyncCrawlerStrategy] = None, -+ config: Optional[BrowserConfig] = None, - always_bypass_cache: bool = False, - always_by_pass_cache: Optional[bool] = None, # Deprecated parameter - base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), -@@ -75,28 +64,48 @@ class AsyncWebCrawler: - Initialize the AsyncWebCrawler. - - Args: -- crawler_strategy: Strategy for crawling web pages -+ crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy -+ config: Configuration object for browser settings. 
If None, will be created from kwargs - always_bypass_cache: Whether to always bypass cache (new parameter) - always_by_pass_cache: Deprecated, use always_bypass_cache instead - base_directory: Base directory for storing cache -+ thread_safe: Whether to use thread-safe operations -+ **kwargs: Additional arguments for backwards compatibility - """ -- self.verbose = kwargs.get("verbose", False) -+ # Handle browser configuration -+ browser_config = config -+ if browser_config is not None: -+ if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]): -+ self.logger.warning( -+ message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.", -+ tag="WARNING" -+ ) -+ else: -+ # Create browser config from kwargs for backwards compatibility -+ browser_config = BrowserConfig.from_kwargs(kwargs) -+ -+ self.browser_config = browser_config -+ -+ # Initialize logger first since other components may need it - self.logger = AsyncLogger( - log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), -- verbose=self.verbose, -+ verbose=self.browser_config.verbose, - tag_width=10 - ) -+ - -+ # Initialize crawler strategy - self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( -- logger = self.logger, -- **kwargs -+ browser_config=browser_config, -+ logger=self.logger, -+ **kwargs # Pass remaining kwargs for backwards compatibility - ) - -- # Handle deprecated parameter -+ # Handle deprecated cache parameter - if always_by_pass_cache is not None: - if kwargs.get("warning", True): - warnings.warn( -- "'always_by_pass_cache' is deprecated and will be removed in version X.X.X. " -+ "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. " - "Use 'always_bypass_cache' instead. 
" - "Pass warning=False to suppress this warning.", - DeprecationWarning, -@@ -106,13 +115,15 @@ class AsyncWebCrawler: - else: - self.always_bypass_cache = always_bypass_cache - -+ # Thread safety setup - self._lock = asyncio.Lock() if thread_safe else None - -+ # Initialize directories - self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") - os.makedirs(self.crawl4ai_folder, exist_ok=True) - os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) -+ - self.ready = False -- self.verbose = kwargs.get("verbose", False) - - async def __aenter__(self): - await self.crawler_strategy.__aenter__() -@@ -131,197 +142,198 @@ class AsyncWebCrawler: - self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") - self.ready = True - -- async def arun( -- self, -- url: str, -- word_count_threshold=MIN_WORD_THRESHOLD, -- extraction_strategy: ExtractionStrategy = None, -- chunking_strategy: ChunkingStrategy = RegexChunking(), -- content_filter: RelevantContentFilter = None, -- cache_mode: Optional[CacheMode] = None, -- # Deprecated parameters -- bypass_cache: bool = False, -- disable_cache: bool = False, -- no_cache_read: bool = False, -- no_cache_write: bool = False, -- # Other parameters -- css_selector: str = None, -- screenshot: bool = False, -- pdf: bool = False, -- user_agent: str = None, -- verbose=True, -- **kwargs, -- ) -> CrawlResult: -- """ -- Runs the crawler for a single source: URL (web, local file, or raw HTML). 
- -- Migration from legacy cache parameters: -+ async def arun( -+ self, -+ url: str, -+ config: Optional[CrawlerRunConfig] = None, -+ # Legacy parameters maintained for backwards compatibility -+ word_count_threshold=MIN_WORD_THRESHOLD, -+ extraction_strategy: ExtractionStrategy = None, -+ chunking_strategy: ChunkingStrategy = RegexChunking(), -+ content_filter: RelevantContentFilter = None, -+ cache_mode: Optional[CacheMode] = None, -+ # Deprecated cache parameters -+ bypass_cache: bool = False, -+ disable_cache: bool = False, -+ no_cache_read: bool = False, -+ no_cache_write: bool = False, -+ # Other legacy parameters -+ css_selector: str = None, -+ screenshot: bool = False, -+ pdf: bool = False, -+ user_agent: str = None, -+ verbose=True, -+ **kwargs, -+ ) -> CrawlResult: -+ """ -+ Runs the crawler for a single source: URL (web, local file, or raw HTML). -+ -+ Migration Guide: - Old way (deprecated): -- await crawler.arun(url, bypass_cache=True, no_cache_read=True) -+ result = await crawler.arun( -+ url="https://example.com", -+ word_count_threshold=200, -+ screenshot=True, -+ ... -+ ) - -- New way: -- await crawler.arun(url, cache_mode=CacheMode.BYPASS) -+ New way (recommended): -+ config = CrawlerRunConfig( -+ word_count_threshold=200, -+ screenshot=True, -+ ... 
-+ ) -+ result = await crawler.arun(url="https://example.com", crawler_config=config) - -- Args: -- url: The URL to crawl (http://, https://, file://, or raw:) -- cache_mode: Cache behavior control (recommended) -- word_count_threshold: Minimum word count threshold -- extraction_strategy: Strategy for content extraction -- chunking_strategy: Strategy for content chunking -- css_selector: CSS selector for content extraction -- screenshot: Whether to capture screenshot -- user_agent: Custom user agent -- verbose: Enable verbose logging -+ Args: -+ url: The URL to crawl (http://, https://, file://, or raw:) -+ crawler_config: Configuration object controlling crawl behavior -+ [other parameters maintained for backwards compatibility] - -- Deprecated Args: -- bypass_cache: Use cache_mode=CacheMode.BYPASS instead -- disable_cache: Use cache_mode=CacheMode.DISABLED instead -- no_cache_read: Use cache_mode=CacheMode.WRITE_ONLY instead -- no_cache_write: Use cache_mode=CacheMode.READ_ONLY instead -- -- Returns: -- CrawlResult: The result of crawling and processing -- """ -- # Check if url is not string and is not empty -- if not isinstance(url, str) or not url: -- raise ValueError("Invalid URL, make sure the URL is a non-empty string") -- -- async with self._lock or self.nullcontext(): # Lock for thread safety previously -> nullcontext(): -- try: -- # Handle deprecated parameters -- if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): -- if kwargs.get("warning", True): -- warnings.warn( -- "Cache control boolean flags are deprecated and will be removed in version X.X.X. " -- "Use 'cache_mode' parameter instead. 
Examples:\n" -- "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" -- "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" -- "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" -- "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" -- "Pass warning=False to suppress this warning.", -- DeprecationWarning, -- stacklevel=2 -- ) -+ Returns: -+ CrawlResult: The result of crawling and processing -+ """ -+ crawler_config = config -+ if not isinstance(url, str) or not url: -+ raise ValueError("Invalid URL, make sure the URL is a non-empty string") -+ -+ async with self._lock or self.nullcontext(): -+ try: -+ # Handle configuration -+ if crawler_config is not None: -+ if any(param is not None for param in [ -+ word_count_threshold, extraction_strategy, chunking_strategy, -+ content_filter, cache_mode, css_selector, screenshot, pdf -+ ]): -+ self.logger.warning( -+ message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", -+ tag="WARNING" -+ ) -+ config = crawler_config -+ else: -+ # Merge all parameters into a single kwargs dict for config creation -+ config_kwargs = { -+ "word_count_threshold": word_count_threshold, -+ "extraction_strategy": extraction_strategy, -+ "chunking_strategy": chunking_strategy, -+ "content_filter": content_filter, -+ "cache_mode": cache_mode, -+ "bypass_cache": bypass_cache, -+ "disable_cache": disable_cache, -+ "no_cache_read": no_cache_read, -+ "no_cache_write": no_cache_write, -+ "css_selector": css_selector, -+ "screenshot": screenshot, -+ "pdf": pdf, -+ "verbose": verbose, -+ **kwargs -+ } -+ config = CrawlerRunConfig.from_kwargs(config_kwargs) -+ -+ # Handle deprecated cache parameters -+ if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): -+ if kwargs.get("warning", True): -+ warnings.warn( -+ "Cache control boolean flags are deprecated and will be removed in version 0.5.0. 
" -+ "Use 'cache_mode' parameter instead.", -+ DeprecationWarning, -+ stacklevel=2 -+ ) -+ -+ # Convert legacy parameters if cache_mode not provided -+ if config.cache_mode is None: -+ config.cache_mode = _legacy_to_cache_mode( -+ disable_cache=disable_cache, -+ bypass_cache=bypass_cache, -+ no_cache_read=no_cache_read, -+ no_cache_write=no_cache_write -+ ) - -- # Convert legacy parameters if cache_mode not provided -- if cache_mode is None: -- cache_mode = _legacy_to_cache_mode( -- disable_cache=disable_cache, -- bypass_cache=bypass_cache, -- no_cache_read=no_cache_read, -- no_cache_write=no_cache_write -- ) -- -- # Default to ENABLED if no cache mode specified -- if cache_mode is None: -- cache_mode = CacheMode.ENABLED -- -- # Create cache context -- cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) -- -- extraction_strategy = extraction_strategy or NoExtractionStrategy() -- extraction_strategy.verbose = verbose -- if not isinstance(extraction_strategy, ExtractionStrategy): -- raise ValueError("Unsupported extraction strategy") -- if not isinstance(chunking_strategy, ChunkingStrategy): -- raise ValueError("Unsupported chunking strategy") -- -- word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) -- -- async_response: AsyncCrawlResponse = None -- cached_result = None -- screenshot_data = None -- pdf_data = None -- extracted_content = None -- -- start_time = time.perf_counter() -- -- # Try to get cached result if appropriate -- if cache_context.should_read(): -- cached_result = await async_db_manager.aget_cached_url(url) -- -- if cached_result: -- html = sanitize_input_encode(cached_result.html) -- extracted_content = sanitize_input_encode(cached_result.extracted_content or "") -- if screenshot: -+ # Default to ENABLED if no cache mode specified -+ if config.cache_mode is None: -+ config.cache_mode = CacheMode.ENABLED -+ -+ # Create cache context -+ cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache) 
-+ -+ # Initialize processing variables -+ async_response: AsyncCrawlResponse = None -+ cached_result = None -+ screenshot_data = None -+ pdf_data = None -+ extracted_content = None -+ start_time = time.perf_counter() -+ -+ # Try to get cached result if appropriate -+ if cache_context.should_read(): -+ cached_result = await async_db_manager.aget_cached_url(url) -+ -+ if cached_result: -+ html = sanitize_input_encode(cached_result.html) -+ extracted_content = sanitize_input_encode(cached_result.extracted_content or "") -+ # If screenshot is requested but its not in cache, then set cache_result to None - screenshot_data = cached_result.screenshot -- if not screenshot_data: -- cached_result = None -- if pdf: - pdf_data = cached_result.pdf -- if not pdf_data: -+ if config.screenshot and not screenshot or config.pdf and not pdf: - cached_result = None -- # if verbose: -- # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") -- self.logger.url_status( -+ -+ self.logger.url_status( - url=cache_context.display_url, - success=bool(html), - timing=time.perf_counter() - start_time, - tag="FETCH" -- ) -+ ) - -+ # Fetch fresh content if needed -+ if not cached_result or not html: -+ t1 = time.perf_counter() -+ -+ if user_agent: -+ self.crawler_strategy.update_user_agent(user_agent) -+ -+ # Pass config to crawl method -+ async_response = await self.crawler_strategy.crawl( -+ url, -+ config=config # Pass the entire config object -+ ) -+ -+ html = sanitize_input_encode(async_response.html) -+ screenshot_data = async_response.screenshot -+ pdf_data = async_response.pdf_data -+ -+ t2 = time.perf_counter() -+ self.logger.url_status( -+ url=cache_context.display_url, -+ success=bool(html), -+ timing=t2 - t1, -+ tag="FETCH" -+ ) - -- # Fetch fresh content if needed -- if not cached_result or not 
html: -- t1 = time.perf_counter() -- -- if user_agent: -- self.crawler_strategy.update_user_agent(user_agent) -- async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( -- url, -- screenshot=screenshot, -- pdf=pdf, -- **kwargs -- ) -- html = sanitize_input_encode(async_response.html) -- screenshot_data = async_response.screenshot -- pdf_data = async_response.pdf_data -- t2 = time.perf_counter() -- self.logger.url_status( -- url=cache_context.display_url, -- success=bool(html), -- timing=t2 - t1, -- tag="FETCH" -+ # Process the HTML content -+ crawl_result = await self.aprocess_html( -+ url=url, -+ html=html, -+ extracted_content=extracted_content, -+ config=config, # Pass the config object instead of individual parameters -+ screenshot=screenshot_data, -+ pdf_data=pdf_data, -+ verbose=config.verbose - ) -- # if verbose: -- # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") -- -- # Process the HTML content -- crawl_result = await self.aprocess_html( -- url=url, -- html=html, -- extracted_content=extracted_content, -- word_count_threshold=word_count_threshold, -- extraction_strategy=extraction_strategy, -- chunking_strategy=chunking_strategy, -- content_filter=content_filter, -- css_selector=css_selector, -- screenshot=screenshot_data, -- pdf_data=pdf_data, -- verbose=verbose, -- is_cached=bool(cached_result), -- async_response=async_response, -- is_web_url=cache_context.is_web_url, -- is_local_file=cache_context.is_local_file, -- is_raw_html=cache_context.is_raw_html, -- **kwargs, -- ) -- -- # Set response data -- if async_response: -- crawl_result.status_code = async_response.status_code -- crawl_result.response_headers = async_response.response_headers -- crawl_result.downloaded_files = async_response.downloaded_files -- else: -- crawl_result.status_code = 200 -- 
crawl_result.response_headers = cached_result.response_headers if cached_result else {} - -- crawl_result.success = bool(html) -- crawl_result.session_id = kwargs.get("session_id", None) -+ # Set response data -+ if async_response: -+ crawl_result.status_code = async_response.status_code -+ crawl_result.response_headers = async_response.response_headers -+ crawl_result.downloaded_files = async_response.downloaded_files -+ else: -+ crawl_result.status_code = 200 -+ crawl_result.response_headers = cached_result.response_headers if cached_result else {} -+ -+ crawl_result.success = bool(html) -+ crawl_result.session_id = getattr(config, 'session_id', None) - -- # if verbose: -- # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") -- self.logger.success( -+ self.logger.success( - message="{url:.50}... 
| Status: {status} | Total: {timing}", - tag="COMPLETE", - params={ -@@ -335,254 +347,312 @@ class AsyncWebCrawler: - } - ) - -- # Update cache if appropriate -- if cache_context.should_write() and not bool(cached_result): -- await async_db_manager.acache_url(crawl_result) -+ # Update cache if appropriate -+ if cache_context.should_write() and not bool(cached_result): -+ await async_db_manager.acache_url(crawl_result) -+ -+ return crawl_result - -- return crawl_result -+ except Exception as e: -+ error_context = get_error_context(sys.exc_info()) -+ -+ error_message = ( -+ f"Unexpected error in _crawl_web at line {error_context['line_no']} " -+ f"in {error_context['function']} ({error_context['filename']}):\n" -+ f"Error: {str(e)}\n\n" -+ f"Code context:\n{error_context['code_context']}" -+ ) -+ # if not hasattr(e, "msg"): -+ # e.msg = str(e) -+ -+ self.logger.error_status( -+ url=url, -+ error=create_box_message(error_message, type="error"), -+ tag="ERROR" -+ ) -+ -+ return CrawlResult( -+ url=url, -+ html="", -+ success=False, -+ error_message=error_message -+ ) -+ -+ async def aprocess_html( -+ self, -+ url: str, -+ html: str, -+ extracted_content: str, -+ config: CrawlerRunConfig, -+ screenshot: str, -+ pdf_data: str, -+ verbose: bool, -+ **kwargs, -+ ) -> CrawlResult: -+ """ -+ Process HTML content using the provided configuration. 
-+ -+ Args: -+ url: The URL being processed -+ html: Raw HTML content -+ extracted_content: Previously extracted content (if any) -+ config: Configuration object controlling processing behavior -+ screenshot: Screenshot data (if any) -+ verbose: Whether to enable verbose logging -+ **kwargs: Additional parameters for backwards compatibility - -+ Returns: -+ CrawlResult: Processed result containing extracted and formatted content -+ """ -+ try: -+ _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" -+ t1 = time.perf_counter() -+ -+ # Initialize scraping strategy -+ scrapping_strategy = WebScrapingStrategy(logger=self.logger) -+ -+ # Process HTML content -+ result = scrapping_strategy.scrap( -+ url, -+ html, -+ word_count_threshold=config.word_count_threshold, -+ css_selector=config.css_selector, -+ only_text=config.only_text, -+ image_description_min_word_threshold=config.image_description_min_word_threshold, -+ content_filter=config.content_filter -+ ) -+ -+ if result is None: -+ raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") -+ -+ except InvalidCSSSelectorError as e: -+ raise ValueError(str(e)) - except Exception as e: -- if not hasattr(e, "msg"): -- e.msg = str(e) -- # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... 
| {e.msg}{Style.RESET_ALL}") -+ raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") -+ -+ # Extract results -+ markdown_v2 = result.get("markdown_v2", None) -+ cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) -+ markdown = sanitize_input_encode(result.get("markdown", "")) -+ fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) -+ fit_html = sanitize_input_encode(result.get("fit_html", "")) -+ media = result.get("media", []) -+ links = result.get("links", []) -+ metadata = result.get("metadata", {}) -+ -+ # Log processing completion -+ self.logger.info( -+ message="Processed {url:.50}... | Time: {timing}ms", -+ tag="SCRAPE", -+ params={ -+ "url": _url, -+ "timing": int((time.perf_counter() - t1) * 1000) -+ } -+ ) -+ -+ # Handle content extraction if needed -+ if (extracted_content is None and -+ config.extraction_strategy and -+ config.chunking_strategy and -+ not isinstance(config.extraction_strategy, NoExtractionStrategy)): - -- self.logger.error_status( -- # url=cache_context.display_url, -- url=url, -- error=create_box_message(e.msg, type = "error"), -- tag="ERROR" -- ) -- return CrawlResult( -- url=url, -- html="", -- success=False, -- error_message=e.msg -+ t1 = time.perf_counter() -+ -+ # Handle different extraction strategy types -+ if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonCssExtractionStrategy)): -+ config.extraction_strategy.verbose = verbose -+ extracted_content = config.extraction_strategy.run(url, [html]) -+ extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) -+ else: -+ sections = config.chunking_strategy.chunk(markdown) -+ extracted_content = config.extraction_strategy.run(url, sections) -+ extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) -+ -+ # Log extraction completion -+ self.logger.info( -+ message="Completed for {url:.50}... 
| Time: {timing}s", -+ tag="EXTRACT", -+ params={ -+ "url": _url, -+ "timing": time.perf_counter() - t1 -+ } - ) - -- async def arun_many( -- self, -- urls: List[str], -- word_count_threshold=MIN_WORD_THRESHOLD, -- extraction_strategy: ExtractionStrategy = None, -- chunking_strategy: ChunkingStrategy = RegexChunking(), -- content_filter: RelevantContentFilter = None, -- cache_mode: Optional[CacheMode] = None, -- # Deprecated parameters -- bypass_cache: bool = False, -- css_selector: str = None, -- screenshot: bool = False, -- pdf: bool = False, -- user_agent: str = None, -- verbose=True, -- **kwargs, -- ) -> List[CrawlResult]: -- """ -- Runs the crawler for multiple URLs concurrently. -+ # Handle screenshot and PDF data -+ screenshot_data = None if not screenshot else screenshot -+ pdf_data = None if not pdf_data else pdf_data -+ -+ # Apply HTML formatting if requested -+ if config.prettiify: -+ cleaned_html = fast_format_html(cleaned_html) -+ -+ # Return complete crawl result -+ return CrawlResult( -+ url=url, -+ html=html, -+ cleaned_html=cleaned_html, -+ markdown_v2=markdown_v2, -+ markdown=markdown, -+ fit_markdown=fit_markdown, -+ fit_html=fit_html, -+ media=media, -+ links=links, -+ metadata=metadata, -+ screenshot=screenshot_data, -+ pdf=pdf_data, -+ extracted_content=extracted_content, -+ success=True, -+ error_message="", -+ ) - -- Migration from legacy parameters: -+ async def arun_many( -+ self, -+ urls: List[str], -+ config: Optional[CrawlerRunConfig] = None, -+ # Legacy parameters maintained for backwards compatibility -+ word_count_threshold=MIN_WORD_THRESHOLD, -+ extraction_strategy: ExtractionStrategy = None, -+ chunking_strategy: ChunkingStrategy = RegexChunking(), -+ content_filter: RelevantContentFilter = None, -+ cache_mode: Optional[CacheMode] = None, -+ bypass_cache: bool = False, -+ css_selector: str = None, -+ screenshot: bool = False, -+ pdf: bool = False, -+ user_agent: str = None, -+ verbose=True, -+ **kwargs, -+ ) -> List[CrawlResult]: 
-+ """ -+ Runs the crawler for multiple URLs concurrently. -+ -+ Migration Guide: - Old way (deprecated): -- results = await crawler.arun_many(urls, bypass_cache=True) -+ results = await crawler.arun_many( -+ urls, -+ word_count_threshold=200, -+ screenshot=True, -+ ... -+ ) - -- New way: -- results = await crawler.arun_many(urls, cache_mode=CacheMode.BYPASS) -- -- Args: -- urls: List of URLs to crawl -- cache_mode: Cache behavior control (recommended) -- [other parameters same as arun()] -- -- Returns: -- List[CrawlResult]: Results for each URL -- """ -- if bypass_cache: -- if kwargs.get("warning", True): -- warnings.warn( -- "'bypass_cache' is deprecated and will be removed in version X.X.X. " -- "Use 'cache_mode=CacheMode.BYPASS' instead. " -- "Pass warning=False to suppress this warning.", -- DeprecationWarning, -- stacklevel=2 -+ New way (recommended): -+ config = CrawlerRunConfig( -+ word_count_threshold=200, -+ screenshot=True, -+ ... - ) -- if cache_mode is None: -- cache_mode = CacheMode.BYPASS -- -- semaphore_count = kwargs.get('semaphore_count', 10) -- semaphore = asyncio.Semaphore(semaphore_count) -+ results = await crawler.arun_many(urls, crawler_config=config) - -- async def crawl_with_semaphore(url): -- domain = urlparse(url).netloc -- current_time = time.time() -- -- # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") -- self.logger.debug( -- message="Started task for {url:.50}...", -- tag="PARALLEL", -- params={"url": url} -- ) -- -- # Get delay settings from kwargs or use defaults -- mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay -- max_range = kwargs.get('max_range', 0.3) # 1 seconds default max additional delay -- -- # Check if we need to wait -- if domain in self._domain_last_hit: -- time_since_last = current_time - self._domain_last_hit[domain] -- if time_since_last < mean_delay: -- delay = mean_delay + random.uniform(0, max_range) -- await 
asyncio.sleep(delay) -+ Args: -+ urls: List of URLs to crawl -+ crawler_config: Configuration object controlling crawl behavior for all URLs -+ [other parameters maintained for backwards compatibility] - -- # Update last hit time -- self._domain_last_hit[domain] = current_time -- -- async with semaphore: -- return await self.arun( -- url, -- word_count_threshold=word_count_threshold, -- extraction_strategy=extraction_strategy, -- chunking_strategy=chunking_strategy, -- content_filter=content_filter, -- cache_mode=cache_mode, -- css_selector=css_selector, -- screenshot=screenshot, -- user_agent=user_agent, -- verbose=verbose, -- **kwargs, -- ) -+ Returns: -+ List[CrawlResult]: Results for each URL -+ """ -+ crawler_config = config -+ # Handle configuration -+ if crawler_config is not None: -+ if any(param is not None for param in [ -+ word_count_threshold, extraction_strategy, chunking_strategy, -+ content_filter, cache_mode, css_selector, screenshot, pdf -+ ]): -+ self.logger.warning( -+ message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", -+ tag="WARNING" -+ ) -+ config = crawler_config -+ else: -+ # Merge all parameters into a single kwargs dict for config creation -+ config_kwargs = { -+ "word_count_threshold": word_count_threshold, -+ "extraction_strategy": extraction_strategy, -+ "chunking_strategy": chunking_strategy, -+ "content_filter": content_filter, -+ "cache_mode": cache_mode, -+ "bypass_cache": bypass_cache, -+ "css_selector": css_selector, -+ "screenshot": screenshot, -+ "pdf": pdf, -+ "verbose": verbose, -+ **kwargs -+ } -+ config = CrawlerRunConfig.from_kwargs(config_kwargs) -+ -+ if bypass_cache: -+ if kwargs.get("warning", True): -+ warnings.warn( -+ "'bypass_cache' is deprecated and will be removed in version 0.5.0. " -+ "Use 'cache_mode=CacheMode.BYPASS' instead. 
" -+ "Pass warning=False to suppress this warning.", -+ DeprecationWarning, -+ stacklevel=2 -+ ) -+ if config.cache_mode is None: -+ config.cache_mode = CacheMode.BYPASS - -- # Print start message -- # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") -- self.logger.info( -- message="Starting concurrent crawling for {count} URLs...", -- tag="INIT", -- params={"count": len(urls)} -- ) -- start_time = time.perf_counter() -- tasks = [crawl_with_semaphore(url) for url in urls] -- results = await asyncio.gather(*tasks, return_exceptions=True) -- end_time = time.perf_counter() -- # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") -- self.logger.success( -- message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL, -- tag="COMPLETE", -- params={ -- "count": len(urls), -- "timing": f"{end_time - start_time:.2f}s" -- }, -- colors={"timing": Fore.YELLOW} -- ) -- return [result if not isinstance(result, Exception) else str(result) for result in results] -+ semaphore_count = config.semaphore_count or 5 -+ semaphore = asyncio.Semaphore(semaphore_count) - -+ async def crawl_with_semaphore(url): -+ # Handle rate limiting per domain -+ domain = urlparse(url).netloc -+ current_time = time.time() -+ -+ self.logger.debug( -+ message="Started task for {url:.50}...", -+ tag="PARALLEL", -+ params={"url": url} -+ ) - -- async def aprocess_html( -- self, -- url: str, -- html: str, -- extracted_content: str, -- word_count_threshold: int, -- extraction_strategy: ExtractionStrategy, -- chunking_strategy: ChunkingStrategy, -- content_filter: RelevantContentFilter, -- css_selector: str, -- screenshot: str, -- verbose: bool, -- **kwargs, -- ) -> CrawlResult: -- # Extract content from HTML -- try: -- _url 
= url if not kwargs.get("is_raw_html", False) else "Raw HTML" -- t1 = time.perf_counter() -- scrapping_strategy = WebScrapingStrategy( -- logger=self.logger, -- ) -- # result = await scrapping_strategy.ascrap( -- result = scrapping_strategy.scrap( -- url, -- html, -- word_count_threshold=word_count_threshold, -- css_selector=css_selector, -- only_text=kwargs.pop("only_text", False), -- image_description_min_word_threshold=kwargs.pop( -- "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD -- ), -- content_filter = content_filter, -- **kwargs, -- ) -+ # Get delay settings from config -+ mean_delay = config.mean_delay -+ max_range = config.max_range -+ -+ # Apply rate limiting -+ if domain in self._domain_last_hit: -+ time_since_last = current_time - self._domain_last_hit[domain] -+ if time_since_last < mean_delay: -+ delay = mean_delay + random.uniform(0, max_range) -+ await asyncio.sleep(delay) -+ -+ self._domain_last_hit[domain] = current_time - -- if result is None: -- raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") -- except InvalidCSSSelectorError as e: -- raise ValueError(str(e)) -- except Exception as e: -- raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") -+ async with semaphore: -+ return await self.arun( -+ url, -+ crawler_config=config, # Pass the entire config object -+ user_agent=user_agent # Maintain user_agent override capability -+ ) - -- markdown_v2: MarkdownGenerationResult = result.get("markdown_v2", None) -- -- cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) -- markdown = sanitize_input_encode(result.get("markdown", "")) -- fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) -- fit_html = sanitize_input_encode(result.get("fit_html", "")) -- media = result.get("media", []) -- links = result.get("links", []) -- metadata = result.get("metadata", {}) -- -- # if verbose: -- # 
print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") -- self.logger.info( -- message="Processed {url:.50}... | Time: {timing}ms", -- tag="SCRAPE", -- params={ -- "url": _url, -- "timing": int((time.perf_counter() - t1) * 1000) -- } -- ) -+ # Log start of concurrent crawling -+ self.logger.info( -+ message="Starting concurrent crawling for {count} URLs...", -+ tag="INIT", -+ params={"count": len(urls)} -+ ) - -+ # Execute concurrent crawls -+ start_time = time.perf_counter() -+ tasks = [crawl_with_semaphore(url) for url in urls] -+ results = await asyncio.gather(*tasks, return_exceptions=True) -+ end_time = time.perf_counter() - -- if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): -- t1 = time.perf_counter() -- # Check if extraction strategy is type of JsonCssExtractionStrategy -- if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): -- extraction_strategy.verbose = verbose -- extracted_content = extraction_strategy.run(url, [html]) -- extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) -- else: -- sections = chunking_strategy.chunk(markdown) -- extracted_content = extraction_strategy.run(url, sections) -- extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) -- # if verbose: -- # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") -- self.logger.info( -- message="Completed for {url:.50}... 
| Time: {timing}s", -- tag="EXTRACT", -+ # Log completion -+ self.logger.success( -+ message="Concurrent crawling completed for {count} URLs | Total time: {timing}", -+ tag="COMPLETE", - params={ -- "url": _url, -- "timing": time.perf_counter() - t1 -+ "count": len(urls), -+ "timing": f"{end_time - start_time:.2f}s" -+ }, -+ colors={ -+ "timing": Fore.YELLOW - } - ) - -- screenshot = None if not screenshot else screenshot -- pdf_data = kwargs.get("pdf_data", None) -- -- -- if kwargs.get("prettiify", False): -- cleaned_html = fast_format_html(cleaned_html) -- -- return CrawlResult( -- url=url, -- html=html, -- cleaned_html=cleaned_html, -- markdown_v2=markdown_v2, -- markdown=markdown, -- fit_markdown=fit_markdown, -- fit_html= fit_html, -- media=media, -- links=links, -- metadata=metadata, -- screenshot=screenshot, -- pdf=pdf_data, -- extracted_content=extracted_content, -- success=True, -- error_message="", -- ) -+ return [result if not isinstance(result, Exception) else str(result) for result in results] - - async def aclear_cache(self): - """Clear the cache database.""" -diff --git a/crawl4ai/config.py b/crawl4ai/config.py -index e17ff34..7c8a931 100644 ---- a/crawl4ai/config.py -+++ b/crawl4ai/config.py -@@ -57,4 +57,6 @@ MAX_METRICS_HISTORY = 1000 - NEED_MIGRATION = True - URL_LOG_SHORTEN_LENGTH = 30 - SHOW_DEPRECATION_WARNINGS = True --SCREENSHOT_HEIGHT_TRESHOLD = 10000 -\ No newline at end of file -+SCREENSHOT_HEIGHT_TRESHOLD = 10000 -+PAGE_TIMEOUT=60000 -+DOWNLOAD_PAGE_TIMEOUT=60000 -\ No newline at end of file -diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py -index 8a12ff0..7ecc22d 100644 ---- a/crawl4ai/utils.py -+++ b/crawl4ai/utils.py -@@ -29,7 +29,7 @@ class InvalidCSSSelectorError(Exception): - def create_box_message( - message: str, - type: str = "info", -- width: int = 80, -+ width: int = 120, - add_newlines: bool = True, - double_line: bool = False - ) -> str: -@@ -1223,7 +1223,8 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]: - 
'cleaned': 'cleaned_html', - 'markdown': 'markdown_content', - 'extracted': 'extracted_content', -- 'screenshots': 'screenshots' -+ 'screenshots': 'screenshots', -+ 'screenshot': 'screenshots' - } - - content_paths = {} -@@ -1232,4 +1233,60 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]: - os.makedirs(path, exist_ok=True) - content_paths[key] = path - -- return content_paths -\ No newline at end of file -+ return content_paths -+ -+def get_error_context(exc_info, context_lines: int = 5): -+ """ -+ Extract error context with more reliable line number tracking. -+ -+ Args: -+ exc_info: The exception info from sys.exc_info() -+ context_lines: Number of lines to show before and after the error -+ -+ Returns: -+ dict: Error context information -+ """ -+ import traceback -+ import linecache -+ import os -+ -+ # Get the full traceback -+ tb = traceback.extract_tb(exc_info[2]) -+ -+ # Get the last frame (where the error occurred) -+ last_frame = tb[-1] -+ filename = last_frame.filename -+ line_no = last_frame.lineno -+ func_name = last_frame.name -+ -+ # Get the source code context using linecache -+ # This is more reliable than inspect.getsourcelines -+ context_start = max(1, line_no - context_lines) -+ context_end = line_no + context_lines + 1 -+ -+ # Build the context lines with line numbers -+ context_lines = [] -+ for i in range(context_start, context_end): -+ line = linecache.getline(filename, i) -+ if line: -+ # Remove any trailing whitespace/newlines and add the pointer for error line -+ line = line.rstrip() -+ pointer = '→' if i == line_no else ' ' -+ context_lines.append(f"{i:4d} {pointer} {line}") -+ -+ # Join the lines with newlines -+ code_context = '\n'.join(context_lines) -+ -+ # Get relative path for cleaner output -+ try: -+ rel_path = os.path.relpath(filename) -+ except ValueError: -+ # Fallback if relpath fails (can happen on Windows with different drives) -+ rel_path = filename -+ -+ return { -+ "filename": rel_path, -+ "line_no": line_no, 
-+ "function": func_name, -+ "code_context": code_context -+ } -\ No newline at end of file -diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/basic/cache-modes.md -index 04a4f21..01cfe34 100644 ---- a/docs/md_v2/basic/cache-modes.md -+++ b/docs/md_v2/basic/cache-modes.md -@@ -1,7 +1,7 @@ - # Crawl4AI Cache System and Migration Guide - - ## Overview --Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. -+Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. - - ## Old vs New Approach - diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index c9c35576..73e5c025 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.22" +__version__ = "0.4.24" diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index aa0b849e..4d85bc8f 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1,13 +1,18 @@ from .config import ( - MIN_WORD_THRESHOLD, + MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, SCREENSHOT_HEIGHT_TRESHOLD, - PAGE_TIMEOUT + PAGE_TIMEOUT, + IMAGE_SCORE_THRESHOLD, + SOCIAL_MEDIA_DOMAINS, + ) from .user_agent_generator import UserAgentGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy from .markdown_generation_strategy import MarkdownGenerationStrategy +from typing import Union, List + class BrowserConfig: """ @@ -24,6 +29,7 @@ class BrowserConfig: Default: True. use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing advanced manipulation. Default: False. 
+ debugging_port (int): Port for the browser debugging protocol. Default: 9222. use_persistent_context (bool): Use a persistent browser context (like a persistent profile). Automatically sets use_managed_browser=True. Default: False. user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a @@ -34,8 +40,8 @@ class BrowserConfig: Default: None. proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. - viewport_width (int): Default viewport width for pages. Default: 1920. - viewport_height (int): Default viewport height for pages. Default: 1080. + viewport_width (int): Default viewport width for pages. Default: 1080. + viewport_height (int): Default viewport height for pages. Default: 600. verbose (bool): Enable verbose logging. Default: True. accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path. @@ -57,7 +63,7 @@ class BrowserConfig: user_agent as-is. Default: None. user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. Default: None. - text_only (bool): If True, disables images and other rich content for potentially faster load times. + text_mode (bool): If True, disables images and other rich content for potentially faster load times. Default: False. light_mode (bool): Disables certain background features for performance gains. Default: False. extra_args (list): Additional command-line arguments passed to the browser. 
@@ -74,8 +80,8 @@ class BrowserConfig: chrome_channel: str = "chrome", proxy: str = None, proxy_config: dict = None, - viewport_width: int = 1920, - viewport_height: int = 1080, + viewport_width: int = 1080, + viewport_height: int = 600, accept_downloads: bool = False, downloads_path: str = None, storage_state=None, @@ -91,9 +97,10 @@ class BrowserConfig: ), user_agent_mode: str = None, user_agent_generator_config: dict = None, - text_only: bool = False, + text_mode: bool = False, light_mode: bool = False, extra_args: list = None, + debugging_port : int = 9222, ): self.browser_type = browser_type self.headless = headless @@ -122,17 +129,23 @@ class BrowserConfig: self.user_agent = user_agent self.user_agent_mode = user_agent_mode self.user_agent_generator_config = user_agent_generator_config - self.text_only = text_only + self.text_mode = text_mode self.light_mode = light_mode self.extra_args = extra_args if extra_args is not None else [] self.sleep_on_close = sleep_on_close self.verbose = verbose - + self.debugging_port = debugging_port + user_agenr_generator = UserAgentGenerator() - if self.user_agent_mode != "random": + if self.user_agent_mode != "random" and self.user_agent_generator_config: self.user_agent = user_agenr_generator.generate( **(self.user_agent_generator_config or {}) ) + elif self.user_agent_mode == "random": + self.user_agent = user_agenr_generator.generate() + else: + pass + self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) self.headers.setdefault("sec-ch-ua", self.browser_hint) @@ -151,8 +164,8 @@ class BrowserConfig: chrome_channel=kwargs.get("chrome_channel", "chrome"), proxy=kwargs.get("proxy"), proxy_config=kwargs.get("proxy_config"), - viewport_width=kwargs.get("viewport_width", 1920), - viewport_height=kwargs.get("viewport_height", 1080), + viewport_width=kwargs.get("viewport_width", 1080), + viewport_height=kwargs.get("viewport_height", 600), accept_downloads=kwargs.get("accept_downloads", False), 
downloads_path=kwargs.get("downloads_path"), storage_state=kwargs.get("storage_state"), @@ -160,15 +173,16 @@ class BrowserConfig: java_script_enabled=kwargs.get("java_script_enabled", True), cookies=kwargs.get("cookies", []), headers=kwargs.get("headers", {}), - user_agent=kwargs.get("user_agent", + user_agent=kwargs.get( + "user_agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36" + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", ), user_agent_mode=kwargs.get("user_agent_mode"), user_agent_generator_config=kwargs.get("user_agent_generator_config"), - text_only=kwargs.get("text_only", False), + text_mode=kwargs.get("text_mode", False), light_mode=kwargs.get("light_mode", False), - extra_args=kwargs.get("extra_args", []) + extra_args=kwargs.get("extra_args", []), ) @@ -182,22 +196,41 @@ class CrawlerRunConfig: By using this class, you have a single place to understand and adjust the crawling options. Attributes: + # Content Processing Parameters word_count_threshold (int): Minimum word count threshold before processing content. Default: MIN_WORD_THRESHOLD (typically 200). extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages. Default: None (NoExtractionStrategy is used if None). chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction. Default: RegexChunking(). + markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown. + Default: None. content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content. Default: None. + only_text (bool): If True, attempt to extract text-only content where applicable. + Default: False. + css_selector (str or None): CSS selector to extract a specific portion of the page. + Default: None. + excluded_tags (list of str or None): List of HTML tags to exclude from processing. + Default: None. 
+ excluded_selector (str or None): CSS selector to exclude from processing. + Default: None. + keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes. + Default: False. + remove_forms (bool): If True, remove all `<form>
` elements from the HTML. + Default: False. + prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output. + Default: False. + parser_type (str): Type of parser to use for HTML parsing. + Default: "lxml". + + # Caching Parameters cache_mode (CacheMode or None): Defines how caching is handled. If None, defaults to CacheMode.ENABLED internally. Default: None. - session_id (str or None): Optional session ID to persist the browser context and the created - page instance. If the ID already exists, the crawler does not - create a new page and uses the current page to preserve the state; - if not, it creates a new page and context then stores it in - memory with the given session ID. + session_id (str or None): Optional session ID to persist the browser context and the created + page instance. If the ID already exists, the crawler does not + create a new page and uses the current page to preserve the state. bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS. Default: False. disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED. @@ -206,36 +239,32 @@ class CrawlerRunConfig: Default: False. no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY. Default: False. - css_selector (str or None): CSS selector to extract a specific portion of the page. - Default: None. - screenshot (bool): Whether to take a screenshot after crawling. - Default: False. - pdf (bool): Whether to generate a PDF of the page. - Default: False. - verbose (bool): Enable verbose logging. - Default: True. - only_text (bool): If True, attempt to extract text-only content where applicable. - Default: False. - image_description_min_word_threshold (int): Minimum words for image description extraction. - Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50). - prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output. - Default: False. 
- js_code (str or list of str or None): JavaScript code/snippets to run on the page. - Default: None. - wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. - Default: None. - js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads. - Default: False. + + # Page Navigation and Timing Parameters wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded". Default: "domcontentloaded". page_timeout (int): Timeout in ms for page operations like navigation. Default: 60000 (60 seconds). + wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. + Default: None. + wait_for_images (bool): If True, wait for images to load before extracting content. + Default: True. + delay_before_return_html (float): Delay in seconds before retrieving final HTML. + Default: 0.1. + mean_delay (float): Mean base delay between requests when calling arun_many. + Default: 0.1. + max_range (float): Max random additional delay range for requests in arun_many. + Default: 0.3. + semaphore_count (int): Number of concurrent operations allowed. + Default: 5. + + # Page Interaction Parameters + js_code (str or list of str or None): JavaScript code/snippets to run on the page. + Default: None. + js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads. + Default: False. ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding. Default: True. - wait_for_images (bool): If True, wait for images to load before extracting content. - Default: True. - adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions. - Default: False. scan_full_page (bool): If True, scroll through the entire page to load all content. Default: False. scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True. 
@@ -244,163 +273,333 @@ class CrawlerRunConfig: Default: False. remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML. Default: False. - delay_before_return_html (float): Delay in seconds before retrieving final HTML. - Default: 0.1. - log_console (bool): If True, log console messages from the page. - Default: False. simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures. Default: False. override_navigator (bool): If True, overrides navigator properties for more human-like behavior. Default: False. magic (bool): If True, attempts automatic handling of overlays/popups. Default: False. + adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions. + Default: False. + + # Media Handling Parameters + screenshot (bool): Whether to take a screenshot after crawling. + Default: False. screenshot_wait_for (float or None): Additional wait time before taking a screenshot. Default: None. screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy. Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000). - mean_delay (float): Mean base delay between requests when calling arun_many. - Default: 0.1. - max_range (float): Max random additional delay range for requests in arun_many. - Default: 0.3. - # session_id and semaphore_count might be set at runtime, not needed as defaults here. + pdf (bool): Whether to generate a PDF of the page. + Default: False. + image_description_min_word_threshold (int): Minimum words for image description extraction. + Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50). + image_score_threshold (int): Minimum score threshold for processing an image. + Default: IMAGE_SCORE_THRESHOLD (e.g., 3). + exclude_external_images (bool): If True, exclude all external images from processing. + Default: False. 
+ + # Link and Domain Handling Parameters + exclude_social_media_domains (list of str): List of domains to exclude for social media links. + Default: SOCIAL_MEDIA_DOMAINS (from config). + exclude_external_links (bool): If True, exclude all external links from the results. + Default: False. + exclude_social_media_links (bool): If True, exclude links pointing to social media domains. + Default: False. + exclude_domains (list of str): List of specific domains to exclude from results. + Default: []. + + # Debugging and Logging Parameters + verbose (bool): Enable verbose logging. + Default: True. + log_console (bool): If True, log console messages from the page. + Default: False. """ def __init__( self, - word_count_threshold: int = MIN_WORD_THRESHOLD , - extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None - chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None - markdown_generator : MarkdownGenerationStrategy = None, + # Content Processing Parameters + word_count_threshold: int = MIN_WORD_THRESHOLD, + extraction_strategy: ExtractionStrategy = None, + chunking_strategy: ChunkingStrategy = None, + markdown_generator: MarkdownGenerationStrategy = None, content_filter=None, + only_text: bool = False, + css_selector: str = None, + excluded_tags: list = None, + excluded_selector: str = None, + keep_data_attributes: bool = False, + remove_forms: bool = False, + prettiify: bool = False, + parser_type: str = "lxml", + + # SSL Parameters + fetch_ssl_certificate: bool = False, + + # Caching Parameters cache_mode=None, session_id: str = None, bypass_cache: bool = False, disable_cache: bool = False, no_cache_read: bool = False, no_cache_write: bool = False, - css_selector: str = None, - screenshot: bool = False, - pdf: bool = False, - verbose: bool = True, - only_text: bool = False, - image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, - prettiify: bool = False, - js_code=None, - 
wait_for: str = None, - js_only: bool = False, + + # Page Navigation and Timing Parameters wait_until: str = "domcontentloaded", page_timeout: int = PAGE_TIMEOUT, - ignore_body_visibility: bool = True, + wait_for: str = None, wait_for_images: bool = True, - adjust_viewport_to_content: bool = False, + delay_before_return_html: float = 0.1, + mean_delay: float = 0.1, + max_range: float = 0.3, + semaphore_count: int = 5, + + # Page Interaction Parameters + js_code: Union[str, List[str]] = None, + js_only: bool = False, + ignore_body_visibility: bool = True, scan_full_page: bool = False, scroll_delay: float = 0.2, process_iframes: bool = False, remove_overlay_elements: bool = False, - delay_before_return_html: float = 0.1, - log_console: bool = False, simulate_user: bool = False, override_navigator: bool = False, magic: bool = False, + adjust_viewport_to_content: bool = False, + + # Media Handling Parameters + screenshot: bool = False, screenshot_wait_for: float = None, screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD, - mean_delay: float = 0.1, - max_range: float = 0.3, - semaphore_count: int = 5, + pdf: bool = False, + image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + image_score_threshold: int = IMAGE_SCORE_THRESHOLD, + exclude_external_images: bool = False, + + # Link and Domain Handling Parameters + exclude_social_media_domains: list = None, + exclude_external_links: bool = False, + exclude_social_media_links: bool = False, + exclude_domains: list = None, + + # Debugging and Logging Parameters + verbose: bool = True, + log_console: bool = False, + + url: str = None, ): + self.url = url + + # Content Processing Parameters self.word_count_threshold = word_count_threshold self.extraction_strategy = extraction_strategy self.chunking_strategy = chunking_strategy self.markdown_generator = markdown_generator self.content_filter = content_filter + self.only_text = only_text + self.css_selector = css_selector + 
self.excluded_tags = excluded_tags or [] + self.excluded_selector = excluded_selector or "" + self.keep_data_attributes = keep_data_attributes + self.remove_forms = remove_forms + self.prettiify = prettiify + self.parser_type = parser_type + + # SSL Parameters + self.fetch_ssl_certificate = fetch_ssl_certificate + + # Caching Parameters self.cache_mode = cache_mode self.session_id = session_id self.bypass_cache = bypass_cache self.disable_cache = disable_cache self.no_cache_read = no_cache_read self.no_cache_write = no_cache_write - self.css_selector = css_selector - self.screenshot = screenshot - self.pdf = pdf - self.verbose = verbose - self.only_text = only_text - self.image_description_min_word_threshold = image_description_min_word_threshold - self.prettiify = prettiify - self.js_code = js_code - self.wait_for = wait_for - self.js_only = js_only + + # Page Navigation and Timing Parameters self.wait_until = wait_until self.page_timeout = page_timeout - self.ignore_body_visibility = ignore_body_visibility + self.wait_for = wait_for self.wait_for_images = wait_for_images - self.adjust_viewport_to_content = adjust_viewport_to_content - self.scan_full_page = scan_full_page - self.scroll_delay = scroll_delay - self.process_iframes = process_iframes - self.remove_overlay_elements = remove_overlay_elements self.delay_before_return_html = delay_before_return_html - self.log_console = log_console - self.simulate_user = simulate_user - self.override_navigator = override_navigator - self.magic = magic - self.screenshot_wait_for = screenshot_wait_for - self.screenshot_height_threshold = screenshot_height_threshold self.mean_delay = mean_delay self.max_range = max_range self.semaphore_count = semaphore_count + # Page Interaction Parameters + self.js_code = js_code + self.js_only = js_only + self.ignore_body_visibility = ignore_body_visibility + self.scan_full_page = scan_full_page + self.scroll_delay = scroll_delay + self.process_iframes = process_iframes + 
self.remove_overlay_elements = remove_overlay_elements + self.simulate_user = simulate_user + self.override_navigator = override_navigator + self.magic = magic + self.adjust_viewport_to_content = adjust_viewport_to_content + + # Media Handling Parameters + self.screenshot = screenshot + self.screenshot_wait_for = screenshot_wait_for + self.screenshot_height_threshold = screenshot_height_threshold + self.pdf = pdf + self.image_description_min_word_threshold = image_description_min_word_threshold + self.image_score_threshold = image_score_threshold + self.exclude_external_images = exclude_external_images + + # Link and Domain Handling Parameters + self.exclude_social_media_domains = exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS + self.exclude_external_links = exclude_external_links + self.exclude_social_media_links = exclude_social_media_links + self.exclude_domains = exclude_domains or [] + + # Debugging and Logging Parameters + self.verbose = verbose + self.log_console = log_console + # Validate type of extraction strategy and chunking strategy if they are provided - if self.extraction_strategy is not None and not isinstance(self.extraction_strategy, ExtractionStrategy): + if self.extraction_strategy is not None and not isinstance( + self.extraction_strategy, ExtractionStrategy + ): raise ValueError("extraction_strategy must be an instance of ExtractionStrategy") - if self.chunking_strategy is not None and not isinstance(self.chunking_strategy, ChunkingStrategy): + if self.chunking_strategy is not None and not isinstance( + self.chunking_strategy, ChunkingStrategy + ): raise ValueError("chunking_strategy must be an instance of ChunkingStrategy") # Set default chunking strategy if None if self.chunking_strategy is None: from .chunking_strategy import RegexChunking self.chunking_strategy = RegexChunking() - @staticmethod def from_kwargs(kwargs: dict) -> "CrawlerRunConfig": return CrawlerRunConfig( + # Content Processing Parameters 
word_count_threshold=kwargs.get("word_count_threshold", 200), extraction_strategy=kwargs.get("extraction_strategy"), chunking_strategy=kwargs.get("chunking_strategy"), markdown_generator=kwargs.get("markdown_generator"), content_filter=kwargs.get("content_filter"), + only_text=kwargs.get("only_text", False), + css_selector=kwargs.get("css_selector"), + excluded_tags=kwargs.get("excluded_tags", []), + excluded_selector=kwargs.get("excluded_selector", ""), + keep_data_attributes=kwargs.get("keep_data_attributes", False), + remove_forms=kwargs.get("remove_forms", False), + prettiify=kwargs.get("prettiify", False), + parser_type=kwargs.get("parser_type", "lxml"), + + # SSL Parameters + fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), + + # Caching Parameters cache_mode=kwargs.get("cache_mode"), session_id=kwargs.get("session_id"), bypass_cache=kwargs.get("bypass_cache", False), disable_cache=kwargs.get("disable_cache", False), no_cache_read=kwargs.get("no_cache_read", False), no_cache_write=kwargs.get("no_cache_write", False), - css_selector=kwargs.get("css_selector"), - screenshot=kwargs.get("screenshot", False), - pdf=kwargs.get("pdf", False), - verbose=kwargs.get("verbose", True), - only_text=kwargs.get("only_text", False), - image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD), - prettiify=kwargs.get("prettiify", False), - js_code=kwargs.get("js_code"), # If not provided here, will default inside constructor - wait_for=kwargs.get("wait_for"), - js_only=kwargs.get("js_only", False), + + # Page Navigation and Timing Parameters wait_until=kwargs.get("wait_until", "domcontentloaded"), page_timeout=kwargs.get("page_timeout", 60000), + wait_for=kwargs.get("wait_for"), + wait_for_images=kwargs.get("wait_for_images", True), + delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), + mean_delay=kwargs.get("mean_delay", 0.1), + max_range=kwargs.get("max_range", 0.3), + 
semaphore_count=kwargs.get("semaphore_count", 5), + + # Page Interaction Parameters + js_code=kwargs.get("js_code"), + js_only=kwargs.get("js_only", False), ignore_body_visibility=kwargs.get("ignore_body_visibility", True), - adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False), scan_full_page=kwargs.get("scan_full_page", False), scroll_delay=kwargs.get("scroll_delay", 0.2), process_iframes=kwargs.get("process_iframes", False), remove_overlay_elements=kwargs.get("remove_overlay_elements", False), - delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), - log_console=kwargs.get("log_console", False), simulate_user=kwargs.get("simulate_user", False), override_navigator=kwargs.get("override_navigator", False), magic=kwargs.get("magic", False), + adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False), + + # Media Handling Parameters + screenshot=kwargs.get("screenshot", False), screenshot_wait_for=kwargs.get("screenshot_wait_for"), - screenshot_height_threshold=kwargs.get("screenshot_height_threshold", 20000), - mean_delay=kwargs.get("mean_delay", 0.1), - max_range=kwargs.get("max_range", 0.3), - semaphore_count=kwargs.get("semaphore_count", 5) + screenshot_height_threshold=kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD), + pdf=kwargs.get("pdf", False), + image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD), + image_score_threshold=kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD), + exclude_external_images=kwargs.get("exclude_external_images", False), + + # Link and Domain Handling Parameters + exclude_social_media_domains=kwargs.get("exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS), + exclude_external_links=kwargs.get("exclude_external_links", False), + exclude_social_media_links=kwargs.get("exclude_social_media_links", False), + exclude_domains=kwargs.get("exclude_domains", []), + + # Debugging and 
Logging Parameters + verbose=kwargs.get("verbose", True), + log_console=kwargs.get("log_console", False), + + url=kwargs.get("url"), ) + + # Create a function that returns a dict of the object + def to_dict(self): + return { + "word_count_threshold": self.word_count_threshold, + "extraction_strategy": self.extraction_strategy, + "chunking_strategy": self.chunking_strategy, + "markdown_generator": self.markdown_generator, + "content_filter": self.content_filter, + "only_text": self.only_text, + "css_selector": self.css_selector, + "excluded_tags": self.excluded_tags, + "excluded_selector": self.excluded_selector, + "keep_data_attributes": self.keep_data_attributes, + "remove_forms": self.remove_forms, + "prettiify": self.prettiify, + "parser_type": self.parser_type, + "fetch_ssl_certificate": self.fetch_ssl_certificate, + "cache_mode": self.cache_mode, + "session_id": self.session_id, + "bypass_cache": self.bypass_cache, + "disable_cache": self.disable_cache, + "no_cache_read": self.no_cache_read, + "no_cache_write": self.no_cache_write, + "wait_until": self.wait_until, + "page_timeout": self.page_timeout, + "wait_for": self.wait_for, + "wait_for_images": self.wait_for_images, + "delay_before_return_html": self.delay_before_return_html, + "mean_delay": self.mean_delay, + "max_range": self.max_range, + "semaphore_count": self.semaphore_count, + "js_code": self.js_code, + "js_only": self.js_only, + "ignore_body_visibility": self.ignore_body_visibility, + "scan_full_page": self.scan_full_page, + "scroll_delay": self.scroll_delay, + "process_iframes": self.process_iframes, + "remove_overlay_elements": self.remove_overlay_elements, + "simulate_user": self.simulate_user, + "override_navigator": self.override_navigator, + "magic": self.magic, + "adjust_viewport_to_content": self.adjust_viewport_to_content, + "screenshot": self.screenshot, + "screenshot_wait_for": self.screenshot_wait_for, + "screenshot_height_threshold": self.screenshot_height_threshold, + "pdf": self.pdf, +
"image_description_min_word_threshold": self.image_description_min_word_threshold, + "image_score_threshold": self.image_score_threshold, + "exclude_external_images": self.exclude_external_images, + "exclude_social_media_domains": self.exclude_social_media_domains, + "exclude_external_links": self.exclude_external_links, + "exclude_social_media_links": self.exclude_social_media_links, + "exclude_domains": self.exclude_domains, + "verbose": self.verbose, + "log_console": self.log_console, + "url": self.url, + } diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3f040e13..32bd14b8 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -2,7 +2,7 @@ import asyncio import base64 import time from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional, Awaitable +from typing import Callable, Dict, Any, List, Optional, Awaitable, Union import os, sys, shutil import tempfile, subprocess from playwright.async_api import async_playwright, Page, Browser, Error, BrowserContext @@ -21,12 +21,9 @@ from .utils import get_error_context from .user_agent_generator import UserAgentGenerator from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT from .async_configs import BrowserConfig, CrawlerRunConfig +from .async_logger import AsyncLogger from playwright_stealth import StealthConfig, stealth_async - - -from io import BytesIO -import base64 -from PIL import Image, ImageDraw, ImageFont +from .ssl_certificate import SSLCertificate stealth_config = StealthConfig( webdriver=True, @@ -62,11 +59,64 @@ BROWSER_DISABLE_OPTIONS = [ "--metrics-recording-only", "--no-first-run", "--password-store=basic", - "--use-mock-keychain" + "--use-mock-keychain", ] + class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): + """ + Manages 
the browser process and context. This class allows connecting to the browser using the CDP protocol. + + Attributes: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: False. + browser_process (subprocess.Popen): The process object for the browser. + temp_dir (str): Temporary directory for user data if not provided. + debugging_port (int): Port for debugging the browser. + host (str): Host for debugging the browser. + + Methods: + start(): Starts the browser process and returns the CDP endpoint URL. + _get_browser_path(): Returns the browser executable path based on OS and browser type. + _get_browser_args(): Returns browser-specific command line arguments. + _get_user_data_dir(): Returns the user data directory path. + cleanup(): Terminates the browser process and removes the temporary directory. + """ + + browser_type: str + user_data_dir: str + headless: bool + browser_process: subprocess.Popen + temp_dir: str + debugging_port: int + host: str + def __init__( + self, + browser_type: str = "chromium", + user_data_dir: Optional[str] = None, + headless: bool = False, + logger=None, + host: str = "localhost", + debugging_port: int = 9222, + ): + """ + Initialize the ManagedBrowser instance. + + Args: + browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit". + Default: "chromium". + user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a + temporary directory may be used. Default: None. + headless (bool): Whether to run the browser in headless mode (no visible GUI). + Default: False. + logger (logging.Logger): Logger instance for logging messages. Default: None.
+ host (str): Host for debugging the browser. Default: "localhost". + debugging_port (int): Port for debugging the browser. Default: 9222. + """ self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless @@ -82,7 +132,7 @@ class ManagedBrowser: Starts the browser process and returns the CDP endpoint URL. If user_data_dir is not provided, creates a temporary directory. """ - + # Create temp dir if needed if not self.user_data_dir: self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") @@ -95,9 +145,7 @@ class ManagedBrowser: # Start browser process try: self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE ) # Monitor browser process output for errors asyncio.create_task(self._monitor_browser_process()) @@ -108,14 +156,24 @@ class ManagedBrowser: raise Exception(f"Failed to start browser: {e}") async def _monitor_browser_process(self): - """Monitor the browser process for unexpected termination.""" + """ + Monitor the browser process for unexpected termination. + + How it works: + 1. Read stdout and stderr from the browser process. + 2. If the process has terminated, log the error message and terminate the browser. + 3. If the shutting_down flag is set, log the normal termination message. + 4. If any other error occurs, log the error message. + + Note: This method should be called in a separate task to avoid blocking the main event loop. 
+ """ if self.browser_process: try: stdout, stderr = await asyncio.gather( asyncio.to_thread(self.browser_process.stdout.read), - asyncio.to_thread(self.browser_process.stderr.read) + asyncio.to_thread(self.browser_process.stderr.read), ) - + # Check shutting_down flag BEFORE logging anything if self.browser_process.poll() is not None: if not self.shutting_down: @@ -125,22 +183,22 @@ class ManagedBrowser: params={ "code": self.browser_process.returncode, "stdout": stdout.decode(), - "stderr": stderr.decode() - } - ) + "stderr": stderr.decode(), + }, + ) await self.cleanup() else: self.logger.info( message="Browser process terminated normally | Code: {code}", tag="INFO", - params={"code": self.browser_process.returncode} + params={"code": self.browser_process.returncode}, ) except Exception as e: if not self.shutting_down: self.logger.error( message="Error monitoring browser process: {error}", tag="ERROR", - params={"error": str(e)} + params={"error": str(e)}, ) def _get_browser_path(self) -> str: @@ -149,27 +207,27 @@ class ManagedBrowser: paths = { "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", - "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" + "webkit": "/Applications/Safari.app/Contents/MacOS/Safari", } elif sys.platform == "win32": # Windows paths = { "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", - "webkit": None # WebKit not supported on Windows + "webkit": None, # WebKit not supported on Windows } else: # Linux paths = { "chromium": "google-chrome", "firefox": "firefox", - "webkit": None # WebKit not supported on Linux + "webkit": None, # WebKit not supported on Linux } - + return paths.get(self.browser_type) def _get_browser_args(self) -> List[str]: """Returns browser-specific command line arguments""" base_args = [self._get_browser_path()] - + if self.browser_type == 
"chromium": args = [ f"--remote-debugging-port={self.debugging_port}", @@ -179,21 +237,23 @@ class ManagedBrowser: args.append("--headless=new") elif self.browser_type == "firefox": args = [ - "--remote-debugging-port", str(self.debugging_port), - "--profile", self.user_data_dir, + "--remote-debugging-port", + str(self.debugging_port), + "--profile", + self.user_data_dir, ] if self.headless: args.append("--headless") else: raise NotImplementedError(f"Browser type {self.browser_type} not supported") - + return base_args + args async def cleanup(self): """Cleanup browser process and temporary directory""" # Set shutting_down flag BEFORE any termination actions self.shutting_down = True - + if self.browser_process: try: self.browser_process.terminate() @@ -202,17 +262,17 @@ class ManagedBrowser: if self.browser_process.poll() is not None: break await asyncio.sleep(0.1) - + # Force kill if still running if self.browser_process.poll() is None: self.browser_process.kill() await asyncio.sleep(0.1) # Brief wait for kill to take effect - + except Exception as e: self.logger.error( message="Error terminating browser: {error}", tag="ERROR", - params={"error": str(e)} + params={"error": str(e)}, ) if self.temp_dir and os.path.exists(self.temp_dir): @@ -222,44 +282,70 @@ class ManagedBrowser: self.logger.error( message="Error removing temporary directory: {error}", tag="ERROR", - params={"error": str(e)} + params={"error": str(e)}, ) + class BrowserManager: + """ + Manages the browser instance and context. 
+ + Attributes: + config (BrowserConfig): Configuration object containing all browser settings + logger: Logger instance for recording events and errors + browser (Browser): The browser instance + default_context (BrowserContext): The default browser context + managed_browser (ManagedBrowser): The managed browser instance + playwright (Playwright): The Playwright instance + sessions (dict): Dictionary to store session information + session_ttl (int): Session timeout in seconds + """ def __init__(self, browser_config: BrowserConfig, logger=None): """ Initialize the BrowserManager with a browser configuration. - + Args: browser_config (BrowserConfig): Configuration object containing all browser settings logger: Logger instance for recording events and errors """ - self.config = browser_config + self.config: BrowserConfig = browser_config self.logger = logger - + # Browser state self.browser = None self.default_context = None self.managed_browser = None self.playwright = None - + # Session management self.sessions = {} self.session_ttl = 1800 # 30 minutes - + # Initialize ManagedBrowser if needed if self.config.use_managed_browser: self.managed_browser = ManagedBrowser( browser_type=self.config.browser_type, user_data_dir=self.config.user_data_dir, headless=self.config.headless, - logger=self.logger + logger=self.logger, + debugging_port=self.config.debugging_port, ) async def start(self): - """Start the browser instance and set up the default context.""" + """ + Start the browser instance and set up the default context. + + How it works: + 1. Check if Playwright is already initialized. + 2. If not, initialize Playwright. + 3. If managed browser is used, start it and connect to the CDP endpoint. + 4. If managed browser is not used, launch the browser and set up the default context. + + Note: This method should be called in a separate task to avoid blocking the main event loop. 
+ """ if self.playwright is None: from playwright.async_api import async_playwright + self.playwright = await async_playwright().start() if self.config.use_managed_browser: @@ -269,18 +355,24 @@ class BrowserManager: if contexts: self.default_context = contexts[0] else: - self.default_context = await self.browser.new_context( - viewport={"width": self.config.viewport_width, "height": self.config.viewport_height}, - storage_state=self.config.storage_state, - user_agent=self.config.headers.get("User-Agent", self.config.user_agent), - accept_downloads=self.config.accept_downloads, - ignore_https_errors=self.config.ignore_https_errors, - java_script_enabled=self.config.java_script_enabled - ) + self.default_context = await self.create_browser_context() + # self.default_context = await self.browser.new_context( + # viewport={ + # "width": self.config.viewport_width, + # "height": self.config.viewport_height, + # }, + # storage_state=self.config.storage_state, + # user_agent=self.config.headers.get( + # "User-Agent", self.config.user_agent + # ), + # accept_downloads=self.config.accept_downloads, + # ignore_https_errors=self.config.ignore_https_errors, + # java_script_enabled=self.config.java_script_enabled, + # ) await self.setup_context(self.default_context) else: browser_args = self._build_browser_args() - + # Launch appropriate browser type if self.config.browser_type == "firefox": self.browser = await self.playwright.firefox.launch(**browser_args) @@ -294,6 +386,9 @@ class BrowserManager: def _build_browser_args(self) -> dict: """Build browser launch arguments from config.""" args = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", "--no-sandbox", "--disable-dev-shm-usage", "--no-first-run", @@ -304,47 +399,94 @@ class BrowserManager: "--ignore-certificate-errors-spki-list", "--disable-blink-features=AutomationControlled", "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + 
"--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + # "--single-process", f"--window-size={self.config.viewport_width},{self.config.viewport_height}", ] if self.config.light_mode: args.extend(BROWSER_DISABLE_OPTIONS) - if self.config.text_only: - args.extend(['--blink-settings=imagesEnabled=false', '--disable-remote-fonts']) + if self.config.text_mode: + args.extend( + [ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ] + ) if self.config.extra_args: args.extend(self.config.extra_args) - browser_args = { - "headless": self.config.headless, - "args": args - } + browser_args = {"headless": self.config.headless, "args": args} if self.config.chrome_channel: browser_args["channel"] = self.config.chrome_channel if self.config.accept_downloads: - browser_args["downloads_path"] = (self.config.downloads_path or - os.path.join(os.getcwd(), "downloads")) + browser_args["downloads_path"] = self.config.downloads_path or os.path.join( + os.getcwd(), "downloads" + ) os.makedirs(browser_args["downloads_path"], exist_ok=True) if self.config.proxy or self.config.proxy_config: from playwright.async_api import ProxySettings + proxy_settings = ( - ProxySettings(server=self.config.proxy) if self.config.proxy else - ProxySettings( + ProxySettings(server=self.config.proxy) + if self.config.proxy + else ProxySettings( server=self.config.proxy_config.get("server"), username=self.config.proxy_config.get("username"), - password=self.config.proxy_config.get("password") + password=self.config.proxy_config.get("password"), ) ) browser_args["proxy"] = proxy_settings return browser_args - async def setup_context(self, context: BrowserContext, is_default=False): - """Set up a browser context with the configured options.""" + async def setup_context( + self, + context: BrowserContext, + crawlerRunConfig: CrawlerRunConfig, + 
is_default=False, + ): + """ + Set up a browser context with the configured options. + + How it works: + 1. Set extra HTTP headers if provided. + 2. Add cookies if provided. + 3. Load storage state if provided. + 4. Accept downloads if enabled. + 5. Set default timeouts for navigation and download. + 6. Set user agent if provided. + 7. Set browser hints if provided. + 8. Set proxy if provided. + 9. Set downloads path if provided. + 10. Set storage state if provided. + 11. Set cache if provided. + 12. Set extra HTTP headers if provided. + 13. Add cookies if provided. + 14. Set default timeouts for navigation and download if enabled. + 15. Set user agent if provided. + 16. Set browser hints if provided. + + Args: + context (BrowserContext): The browser context to set up + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + is_default (bool): Flag indicating if this is the default context + Returns: + None + """ if self.config.headers: await context.set_extra_http_headers(self.config.headers) @@ -359,48 +501,135 @@ class BrowserManager: context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) if self.config.downloads_path: context._impl_obj._options["accept_downloads"] = True - context._impl_obj._options["downloads_path"] = self.config.downloads_path + context._impl_obj._options["downloads_path"] = ( + self.config.downloads_path + ) # Handle user agent and browser hints if self.config.user_agent: combined_headers = { "User-Agent": self.config.user_agent, - "sec-ch-ua": self.config.browser_hint + "sec-ch-ua": self.config.browser_hint, } combined_headers.update(self.config.headers) await context.set_extra_http_headers(combined_headers) - async def get_page(self, session_id: Optional[str], user_agent: str): - """Get a page for the given session ID, creating a new one if needed.""" + # Add default cookie + await context.add_cookies( + [{"name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url}] + ) + + # Handle 
navigator overrides + if ( + crawlerRunConfig.override_navigator + or crawlerRunConfig.simulate_user + or crawlerRunConfig.magic + ): + await context.add_init_script(load_js_script("navigator_overrider")) + + async def create_browser_context(self): + """ + Creates and returns a new browser context with configured settings. + Applies text-only mode settings if text_mode is enabled in config. + + Returns: + Context: Browser context object with the specified configurations + """ + # Base settings + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + viewport_settings = { + "width": self.config.viewport_width, + "height": self.config.viewport_height, + } + proxy_settings = {"server": self.config.proxy} if self.config.proxy else None + + blocked_extensions = [ + # Images + 'jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'ico', 'bmp', 'tiff', 'psd', + # Fonts + 'woff', 'woff2', 'ttf', 'otf', 'eot', + # Styles + # 'css', 'less', 'scss', 'sass', + # Media + 'mp4', 'webm', 'ogg', 'avi', 'mov', 'wmv', 'flv', 'm4v', + 'mp3', 'wav', 'aac', 'm4a', 'opus', 'flac', + # Documents + 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx', + # Archives + 'zip', 'rar', '7z', 'tar', 'gz', + # Scripts and data + 'xml', 'swf', 'wasm' + ] + + # Common context settings + context_settings = { + "user_agent": user_agent, + "viewport": viewport_settings, + "proxy": proxy_settings, + "accept_downloads": self.config.accept_downloads, + "storage_state": self.config.storage_state, + "ignore_https_errors": self.config.ignore_https_errors, + "device_scale_factor": 1.0, + "java_script_enabled": self.config.java_script_enabled, + } + + if self.config.text_mode: + text_mode_settings = { + "has_touch": False, + "is_mobile": False, + } + # Update context settings with text mode settings + context_settings.update(text_mode_settings) + + # Create and return the context with all settings + context = await self.browser.new_context(**context_settings) + + # Apply text mode settings if 
enabled + if self.config.text_mode: + # Create and apply route patterns for each extension + for ext in blocked_extensions: + await context.route(f"**/*.{ext}", lambda route: route.abort()) + return context + + # async def get_page(self, session_id: Optional[str], user_agent: str): + async def get_page(self, crawlerRunConfig: CrawlerRunConfig): + """ + Get a page for the given session ID, creating a new one if needed. + + Args: + crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings + + Returns: + Page: The page object for the given session ID. + BrowserContext: The browser context for the given session ID. + """ self._cleanup_expired_sessions() - if session_id and session_id in self.sessions: - context, page, _ = self.sessions[session_id] - self.sessions[session_id] = (context, page, time.time()) + if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions: + context, page, _ = self.sessions[crawlerRunConfig.session_id] + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) return page, context if self.config.use_managed_browser: context = self.default_context page = await context.new_page() else: - context = await self.browser.new_context( - user_agent=user_agent, - viewport={"width": self.config.viewport_width, "height": self.config.viewport_height}, - proxy={"server": self.config.proxy} if self.config.proxy else None, - accept_downloads=self.config.accept_downloads, - storage_state=self.config.storage_state, - ignore_https_errors=self.config.ignore_https_errors - ) - await self.setup_context(context) + context = await self.create_browser_context() + await self.setup_context(context, crawlerRunConfig) page = await context.new_page() - if session_id: - self.sessions[session_id] = (context, page, time.time()) + if crawlerRunConfig.session_id: + self.sessions[crawlerRunConfig.session_id] = (context, page, time.time()) return page, context async def kill_session(self, session_id: str): - 
"""Kill a browser session and clean up resources.""" + """ + Kill a browser session and clean up resources. + + Args: + session_id (str): The session ID to kill. + """ if session_id in self.sessions: context, page, _ = self.sessions[session_id] await page.close() @@ -412,7 +641,8 @@ class BrowserManager: """Clean up expired sessions based on TTL.""" current_time = time.time() expired_sessions = [ - sid for sid, (_, _, last_used) in self.sessions.items() + sid + for sid, (_, _, last_used) in self.sessions.items() if current_time - last_used > self.session_ttl ] for sid in expired_sessions: @@ -422,7 +652,7 @@ class BrowserManager: """Close all browser resources and clean up.""" if self.config.sleep_on_close: await asyncio.sleep(0.5) - + session_ids = list(self.sessions.keys()) for session_id in session_ids: await self.kill_session(session_id) @@ -440,32 +670,52 @@ class BrowserManager: await self.playwright.stop() self.playwright = None + class AsyncCrawlerStrategy(ABC): + """ + Abstract base class for crawler strategies. + Subclasses must implement the crawl method. + """ @abstractmethod async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: - pass # 4 + 3 - - @abstractmethod - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - pass - - @abstractmethod - async def take_screenshot(self, **kwargs) -> str: - pass - - @abstractmethod - def update_user_agent(self, user_agent: str): - pass - - @abstractmethod - def set_hook(self, hook_type: str, hook: Callable): - pass + pass # 4 + 3 + + class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, browser_config: BrowserConfig = None, logger = None, **kwargs): + """ + Crawler strategy using Playwright. + + Attributes: + browser_config (BrowserConfig): Configuration object containing browser settings. + logger (AsyncLogger): Logger instance for recording events and errors. + _downloaded_files (List[str]): List of downloaded file paths. 
+ hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior. + browser_manager (BrowserManager): Manager for browser creation and management. + + Methods: + __init__(self, browser_config=None, logger=None, **kwargs): + Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. + __aenter__(self): + Start the browser and initialize the browser manager. + __aexit__(self, exc_type, exc_val, exc_tb): + Close the browser and clean up resources. + start(self): + Start the browser and initialize the browser manager. + close(self): + Close the browser and clean up resources. + kill_session(self, session_id): + Kill a browser session and clean up resources. + crawl(self, url, **kwargs): + Run the crawler for a single URL. + + """ + def __init__( + self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs + ): """ Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. - + Args: browser_config (BrowserConfig): Configuration object containing browser settings. If None, will be created from kwargs for backwards compatibility. 
@@ -475,25 +725,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Initialize browser config, either from provided object or kwargs self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs) self.logger = logger - + # Initialize session management self._downloaded_files = [] - + # Initialize hooks system self.hooks = { - 'on_browser_created': None, - 'on_user_agent_updated': None, - 'on_execution_started': None, - 'before_goto': None, - 'after_goto': None, - 'before_return_html': None, - 'before_retrieve_html': None + "on_browser_created": None, + "on_page_context_created": None, + "on_user_agent_updated": None, + "on_execution_started": None, + "before_goto": None, + "after_goto": None, + "before_return_html": None, + "before_retrieve_html": None, } - + # Initialize browser manager with config self.browser_manager = BrowserManager( - browser_config=self.browser_config, - logger=self.logger + browser_config=self.browser_config, logger=self.logger ) async def __aenter__(self): @@ -504,27 +754,77 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.close() async def start(self): + """ + Start the browser and initialize the browser manager. + """ await self.browser_manager.start() - await self.execute_hook('on_browser_created', self.browser_manager.browser, context = self.browser_manager.default_context) - + await self.execute_hook( + "on_browser_created", + self.browser_manager.browser, + context=self.browser_manager.default_context, + ) + async def close(self): + """ + Close the browser and clean up resources. + """ await self.browser_manager.close() - + async def kill_session(self, session_id: str): + """ + Kill a browser session and clean up resources. + + Args: + session_id (str): The ID of the session to kill. + + Returns: + None + """ # Log a warning message and no need kill session, in new version auto kill session self.logger.warning( message="Session auto-kill is enabled in the new version. 
No need to manually kill sessions.", - tag="WARNING" + tag="WARNING", ) await self.browser_manager.kill_session(session_id) def set_hook(self, hook_type: str, hook: Callable): + """ + Set a hook function for a specific hook type. The following hook types are available: + - on_browser_created: Called when a new browser instance is created. + - on_page_context_created: Called when a new page context is created. + - on_user_agent_updated: Called when the user agent is updated. + - on_execution_started: Called when the execution starts. + - before_goto: Called before a goto operation. + - after_goto: Called after a goto operation. + - before_return_html: Called before returning HTML content. + - before_retrieve_html: Called before retrieving HTML content. + + All hooks except on_browser_created accept a context and a page as arguments, plus **kwargs. The on_browser_created hook instead accepts a browser and a context as arguments, plus **kwargs. + + Args: + hook_type (str): The type of the hook. + hook (Callable): The hook function to set. + + Returns: + None + """ if hook_type in self.hooks: self.hooks[hook_type] = hook else: raise ValueError(f"Invalid hook type: {hook_type}") async def execute_hook(self, hook_type: str, *args, **kwargs): + """ + Execute a hook function for a specific hook type. + + Args: + hook_type (str): The type of the hook. + *args: Variable length positional arguments. + **kwargs: Keyword arguments. + + Returns: + The return value of the hook function, if any. + """ hook = self.hooks.get(hook_type) if hook: if asyncio.iscoroutinefunction(hook): @@ -534,31 +834,68 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return args[0] if args else None def update_user_agent(self, user_agent: str): + """ + Update the user agent for the browser. + + Args: + user_agent (str): The new user agent string. 
+ + Returns: + None + """ self.user_agent = user_agent def set_custom_headers(self, headers: Dict[str, str]): - self.headers = headers - - async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): - wait_for = wait_for.strip() + """ + Set custom headers for the browser. - if wait_for.startswith('js:'): + Args: + headers (Dict[str, str]): A dictionary of headers to set. + + Returns: + None + """ + self.headers = headers + + async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + """ + Wait for a condition in a smart way. This function works as follows: + + 1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true. + 2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present. + 3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true. + 4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present. + + This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl(). + Args: + page: Playwright page object + wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'. 
+ timeout (float): Maximum time to wait in milliseconds + + Returns: + None + """ + wait_for = wait_for.strip() + + if wait_for.startswith("js:"): # Explicitly specified JavaScript js_code = wait_for[3:].strip() return await self.csp_compliant_wait(page, js_code, timeout) - elif wait_for.startswith('css:'): + elif wait_for.startswith("css:"): # Explicitly specified CSS selector css_selector = wait_for[4:].strip() try: await page.wait_for_selector(css_selector, timeout=timeout) except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'") + if "Timeout" in str(e): + raise TimeoutError( + f"Timeout after {timeout}ms waiting for selector '{css_selector}'" + ) else: raise ValueError(f"Invalid CSS selector: '{css_selector}'") else: # Auto-detect based on content - if wait_for.startswith('()') or wait_for.startswith('function'): + if wait_for.startswith("()") or wait_for.startswith("function"): # It's likely a JavaScript function return await self.csp_compliant_wait(page, wait_for, timeout) else: @@ -566,67 +903,107 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): try: await page.wait_for_selector(wait_for, timeout=timeout) except Error as e: - if 'Timeout' in str(e): - raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'") + if "Timeout" in str(e): + raise TimeoutError( + f"Timeout after {timeout}ms waiting for selector '{wait_for}'" + ) else: # If it's not a timeout error, it might be an invalid selector # Let's try to evaluate it as a JavaScript function as a fallback try: - return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout) + return await self.csp_compliant_wait( + page, f"() => {{{wait_for}}}", timeout + ) except Error: - raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. 
" - "It should be either a valid CSS selector, a JavaScript function, " - "or explicitly prefixed with 'js:' or 'css:'.") - - async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000): + raise ValueError( + f"Invalid wait_for parameter: '{wait_for}'. " + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'." + ) + + async def csp_compliant_wait( self, page: Page, user_wait_function: str, timeout: float = 30000 ): + """ + Wait for a condition in a CSP-compliant way. + + Args: + page: Playwright page object + user_wait_function: JavaScript function as string that returns boolean + timeout: Maximum time to wait in milliseconds + + Returns: + bool: True if condition was met, False if timed out + + Raises: + RuntimeError: If there's an error evaluating the condition + """ wrapper_js = f""" async () => {{ const userFunction = {user_wait_function}; const startTime = Date.now(); - while (true) {{ - if (await userFunction()) {{ - return true; + try {{ + while (true) {{ + if (await userFunction()) {{ + return true; + }} + if (Date.now() - startTime > {timeout}) {{ + return false; // Return false instead of throwing + }} + await new Promise(resolve => setTimeout(resolve, 100)); }} - if (Date.now() - startTime > {timeout}) {{ - throw new Error('Timeout waiting for condition'); - }} - await new Promise(resolve => setTimeout(resolve, 100)); + }} catch (error) {{ + throw new Error(`Error evaluating condition: ${{error.message}}`); }} }} """ - + try: - await page.evaluate(wrapper_js) - except TimeoutError: - raise TimeoutError(f"Timeout after {timeout}ms waiting for condition") + result = await page.evaluate(wrapper_js) + return result except Exception as e: - raise RuntimeError(f"Error in wait condition: {str(e)}") + if "Error evaluating condition" in str(e): + raise RuntimeError(f"Failed to evaluate wait condition: {str(e)}") + # For timeout or other cases, just return False + 
return False async def process_iframes(self, page): - # Find all iframes - iframes = await page.query_selector_all('iframe') + """ + Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content. + Args: + page: Playwright page object + + Returns: + Playwright page object + """ + # Find all iframes + iframes = await page.query_selector_all("iframe") + for i, iframe in enumerate(iframes): try: # Add a unique identifier to the iframe await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') - + # Get the frame associated with this iframe frame = await iframe.content_frame() - + if frame: # Wait for the frame to load - await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout - + await frame.wait_for_load_state( + "load", timeout=30000 + ) # 30 seconds timeout + # Extract the content of the iframe's body - iframe_content = await frame.evaluate('() => document.body.innerHTML') - + iframe_content = await frame.evaluate( + "() => document.body.innerHTML" + ) + # Generate a unique class name for this iframe - class_name = f'extracted-iframe-content-{i}' - + class_name = f"extracted-iframe-content-{i}" + # Replace the iframe with a div containing the extracted content - _iframe = iframe_content.replace('`', '\\`') - await page.evaluate(f""" + _iframe = iframe_content.replace("`", "\\`") + await page.evaluate( + f""" () => {{ const iframe = document.getElementById('iframe-{i}'); const div = document.createElement('div'); @@ -634,35 +1011,45 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): div.className = '{class_name}'; iframe.replaceWith(div); }} - """) + """ + ) else: self.logger.warning( message="Could not access content frame for iframe {index}", tag="SCRAPE", - params={"index": i} - ) + params={"index": i}, + ) except Exception as e: self.logger.error( message="Error processing iframe {index}: {error}", tag="ERROR", - params={"index": i, "error": str(e)} 
- ) + params={"index": i, "error": str(e)}, + ) # Return the page object - return page - + return page + async def create_session(self, **kwargs) -> str: - """Creates a new browser session and returns its ID.""" + """ + Creates a new browser session and returns its ID. A browser session is a unique opened page that can be reused for multiple crawls. + This function is asynchronous and returns a string representing the session ID. + + Args: + **kwargs: Optional keyword arguments to configure the session. + + Returns: + str: The session ID. + """ await self.start() - - session_id = kwargs.get('session_id') or str(uuid.uuid4()) - + + session_id = kwargs.get("session_id") or str(uuid.uuid4()) + user_agent = kwargs.get("user_agent", self.user_agent) # Use browser_manager to get a fresh page & context assigned to this session_id page, context = await self.browser_manager.get_page(session_id, user_agent) return session_id - - async def crawl(self, url: str, config: CrawlerRunConfig, **kwargs) -> AsyncCrawlResponse: + + async def crawl( self, url: str, config: CrawlerRunConfig, **kwargs ) -> AsyncCrawlResponse: """ Crawls a given URL or processes raw HTML/local file content based on the URL prefix. @@ -670,7 +1057,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): url (str): The URL to crawl. Supported prefixes: - 'http://' or 'https://': Web URL to crawl. - 'file://': Local file path to process. - - 'raw:': Raw HTML content to process. + - 'raw://': Raw HTML content to process. **kwargs: Additional parameters: - 'screenshot' (bool): Whether to take a screenshot. - ... 
[other existing parameters] @@ -683,15 +1070,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): status_code = 200 # Default for local/raw HTML screenshot_data = None - if url.startswith(('http://', 'https://')): + if url.startswith(("http://", "https://")): return await self._crawl_web(url, config) - elif url.startswith('file://'): + elif url.startswith("file://"): # Process local file local_file_path = url[7:] # Remove 'file://' prefix if not os.path.exists(local_file_path): raise FileNotFoundError(f"Local file not found: {local_file_path}") - with open(local_file_path, 'r', encoding='utf-8') as f: + with open(local_file_path, "r", encoding="utf-8") as f: html = f.read() if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) @@ -700,12 +1087,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, - get_delayed_content=None + get_delayed_content=None, ) - elif url.startswith('raw:'): + elif url.startswith("raw:") or url.startswith("raw://"): # Process raw HTML content - raw_html = url[4:] # Remove 'raw:' prefix + raw_html = url[4:] if url[:4] == "raw:" else url[7:] html = raw_html if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) @@ -714,160 +1101,257 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, - get_delayed_content=None + get_delayed_content=None, ) else: - raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") + raise ValueError( + "URL must start with 'http://', 'https://', 'file://', or 'raw:'" + ) - async def _crawl_web(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse: + async def _crawl_web( self, url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse: """ Internal method to crawl web URLs with the specified configuration. 
- + Args: url (str): The web URL to crawl config (CrawlerRunConfig): Configuration object controlling the crawl behavior - + Returns: AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data """ + config.url = url response_headers = {} status_code = None - + # Reset downloaded files list for new crawl self._downloaded_files = [] - + # Handle user agent with magic mode user_agent = self.browser_config.user_agent if config.magic and self.browser_config.user_agent_mode != "random": - user_agent = UserAgentGenerator().generate( + self.browser_config.user_agent = UserAgentGenerator().generate( **(self.browser_config.user_agent_generator_config or {}) ) - + # Get page for session - page, context = await self.browser_manager.get_page( - session_id=config.session_id, - user_agent=user_agent - ) - + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + # Add default cookie - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) - + await context.add_cookies( + [{"name": "cookiesEnabled", "value": "true", "url": url}] + ) + # Handle navigator overrides if config.override_navigator or config.simulate_user or config.magic: await context.add_init_script(load_js_script("navigator_overrider")) - + + # Call hook after page creation + await self.execute_hook("on_page_context_created", page, context=context) + # Set up console logging if requested if config.log_console: - page.on("console", lambda msg: self.logger.debug( - message="Console: {msg}", - tag="CONSOLE", - params={"msg": msg.text} - )) - page.on("pageerror", lambda exc: self.logger.error( - message="Page error: {exc}", - tag="ERROR", - params={"exc": exc} - )) - + + def log_consol( + msg, console_log_type="debug" + ): # Corrected the parameter syntax + if console_log_type == "error": + self.logger.error( + message=f"Console error: {msg}", # Use f-string for variable interpolation + tag="CONSOLE", + params={"msg": msg.text}, + ) + elif 
console_log_type == "debug": + self.logger.debug( + message=f"Console: {msg}", # Use f-string for variable interpolation + tag="CONSOLE", + params={"msg": msg.text}, + ) + + page.on("console", log_consol) + page.on("pageerror", lambda e: log_consol(e, "error")) + try: + # Get SSL certificate information if requested and URL is HTTPS + ssl_cert = None + if config.fetch_ssl_certificate: + ssl_cert = SSLCertificate.from_url(url) + # Set up download handling if self.browser_config.accept_downloads: - page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) + page.on( + "download", + lambda download: asyncio.create_task( + self._handle_download(download) + ), + ) # Handle page navigation and content loading if not config.js_only: - await self.execute_hook('before_goto', page, context=context) + await self.execute_hook("before_goto", page, context=context, url=url) try: + # Generate a unique nonce for this request + nonce = hashlib.sha256(os.urandom(32)).hexdigest() + + # Add CSP headers to the request + await page.set_extra_http_headers({ + 'Content-Security-Policy': f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'" + }) + response = await page.goto( - url, - wait_until=config.wait_until, - timeout=config.page_timeout + url, wait_until=config.wait_until, timeout=config.page_timeout ) except Error as e: raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") - - await self.execute_hook('after_goto', page, context=context) - - status_code = response.status - response_headers = response.headers + + await self.execute_hook("after_goto", page, context=context, url=url, response=response) + + if response is None: + status_code = 200 + response_headers = {} + else: + status_code = response.status + response_headers = response.headers + else: status_code = 200 response_headers = {} # Wait for body element and visibility try: - await page.wait_for_selector('body', state='attached', timeout=30000) - await 
page.wait_for_function(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0'; - } - """, timeout=30000) + await page.wait_for_selector("body", state="attached", timeout=30000) + + # Use the new check_visibility function with csp_compliant_wait + is_visible = await self.csp_compliant_wait( + page, + """() => { + const element = document.body; + if (!element) return false; + const style = window.getComputedStyle(element); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + return isVisible; + }""", + timeout=30000 + ) + + if not is_visible and not config.ignore_body_visibility: + visibility_info = await self.check_visibility(page) + raise Error(f"Body element is hidden: {visibility_info}") + except Error as e: - visibility_info = await page.evaluate(""" - () => { - const body = document.body; - const style = window.getComputedStyle(body); - return { - display: style.display, - visibility: style.visibility, - opacity: style.opacity, - hasContent: body.innerHTML.length, - classList: Array.from(body.classList) - } - } - """) + visibility_info = await self.check_visibility(page) if self.config.verbose: self.logger.debug( message="Body visibility info: {info}", tag="DEBUG", - params={"info": visibility_info} + params={"info": visibility_info}, ) - + if not config.ignore_body_visibility: - raise Error(f"Body element is hidden: {visibility_info}") + raise Error(f"Body element is hidden: {visibility_info}") + + + # try: + # await page.wait_for_selector("body", state="attached", timeout=30000) + + # await page.wait_for_function( + # """ + # () => { + # const body = document.body; + # const style = window.getComputedStyle(body); + # return style.display !== 'none' && + # style.visibility !== 'hidden' && + # style.opacity !== '0'; + # } + # """, + # timeout=30000, + # ) + # except Error as 
e: + # visibility_info = await page.evaluate( + # """ + # () => { + # const body = document.body; + # const style = window.getComputedStyle(body); + # return { + # display: style.display, + # visibility: style.visibility, + # opacity: style.opacity, + # hasContent: body.innerHTML.length, + # classList: Array.from(body.classList) + # } + # } + # """ + # ) + + # if self.config.verbose: + # self.logger.debug( + # message="Body visibility info: {info}", + # tag="DEBUG", + # params={"info": visibility_info}, + # ) + + # if not config.ignore_body_visibility: + # raise Error(f"Body element is hidden: {visibility_info}") # Handle content loading and viewport adjustment - if not self.browser_config.text_only and (config.wait_for_images or config.adjust_viewport_to_content): + if not self.browser_config.text_mode and ( + config.wait_for_images or config.adjust_viewport_to_content + ): await page.wait_for_load_state("domcontentloaded") await asyncio.sleep(0.1) - try: - await page.wait_for_function( - "Array.from(document.images).every(img => img.complete)", - timeout=1000 + + # Check for image loading with improved error handling + images_loaded = await self.csp_compliant_wait( + page, + "() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)", + timeout=1000 + ) + + if not images_loaded and self.logger: + self.logger.warning( + message="Some images failed to load within timeout", + tag="SCRAPE", ) - except PlaywrightTimeoutError: - pass # Adjust viewport if needed - if not self.browser_config.text_only and config.adjust_viewport_to_content: + if not self.browser_config.text_mode and config.adjust_viewport_to_content: try: - page_width = await page.evaluate("document.documentElement.scrollWidth") - page_height = await page.evaluate("document.documentElement.scrollHeight") - + dimensions = await self.get_page_dimensions(page) + page_height = dimensions['height'] + page_width = dimensions['width'] + # page_width = await page.evaluate( + # 
"document.documentElement.scrollWidth" + # ) + # page_height = await page.evaluate( + # "document.documentElement.scrollHeight" + # ) + target_width = self.browser_config.viewport_width target_height = int(target_width * page_width / page_height * 0.95) - await page.set_viewport_size({"width": target_width, "height": target_height}) + await page.set_viewport_size( + {"width": target_width, "height": target_height} + ) scale = min(target_width / page_width, target_height / page_height) cdp = await page.context.new_cdp_session(page) - await cdp.send('Emulation.setDeviceMetricsOverride', { - 'width': page_width, - 'height': page_height, - 'deviceScaleFactor': 1, - 'mobile': False, - 'scale': scale - }) + await cdp.send( + "Emulation.setDeviceMetricsOverride", + { + "width": page_width, + "height": page_height, + "deviceScaleFactor": 1, + "mobile": False, + "scale": scale, + }, + ) except Exception as e: self.logger.warning( message="Failed to adjust viewport to content: {error}", tag="VIEWPORT", - params={"error": str(e)} + params={"error": str(e)}, ) # Handle full page scanning @@ -875,31 +1359,43 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self._handle_full_page_scan(page, config.scroll_delay) # Execute JavaScript if provided + # if config.js_code: + # if isinstance(config.js_code, str): + # await page.evaluate(config.js_code) + # elif isinstance(config.js_code, list): + # for js in config.js_code: + # await page.evaluate(js) + if config.js_code: - if isinstance(config.js_code, str): - await page.evaluate(config.js_code) - elif isinstance(config.js_code, list): - for js in config.js_code: - await page.evaluate(js) - - await self.execute_hook('on_execution_started', page, context=context) + # execution_result = await self.execute_user_script(page, config.js_code) + execution_result = await self.robust_execute_user_script(page, config.js_code) + if not execution_result["success"]: + self.logger.warning( + message="User script execution had 
issues: {error}", + tag="JS_EXEC", + params={"error": execution_result.get("error")} + ) + + await self.execute_hook("on_execution_started", page, context=context) # Handle user simulation if config.simulate_user or config.magic: await page.mouse.move(100, 100) await page.mouse.down() await page.mouse.up() - await page.keyboard.press('ArrowDown') + await page.keyboard.press("ArrowDown") # Handle wait_for condition if config.wait_for: try: - await self.smart_wait(page, config.wait_for, timeout=config.page_timeout) + await self.smart_wait( + page, config.wait_for, timeout=config.page_timeout + ) except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") # Update image dimensions if needed - if not self.browser_config.text_only: + if not self.browser_config.text_mode: update_image_dimensions_js = load_js_script("update_image_dimensions") try: try: @@ -911,7 +1407,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.logger.error( message="Error updating image dimensions: {error}", tag="ERROR", - params={"error": str(e)} + params={"error": str(e)}, ) # Process iframes if needed @@ -919,7 +1415,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page = await self.process_iframes(page) # Pre-content retrieval hooks and delay - await self.execute_hook('before_retrieve_html', page, context=context) + await self.execute_hook("before_retrieve_html", page, context=context) if config.delay_before_return_html: await asyncio.sleep(config.delay_before_return_html) @@ -929,7 +1425,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Get final HTML content html = await page.content() - await self.execute_hook('before_return_html', page, html, context=context) + await self.execute_hook("before_return_html", page = page, html = html, context=context) # Handle PDF and screenshot generation start_export_time = time.perf_counter() @@ -943,25 +1439,23 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if 
config.screenshot_wait_for: await asyncio.sleep(config.screenshot_wait_for) screenshot_data = await self.take_screenshot( - page, - screenshot_height_threshold=config.screenshot_height_threshold + page, screenshot_height_threshold=config.screenshot_height_threshold ) if screenshot_data or pdf_data: self.logger.info( message="Exporting PDF and taking screenshot took {duration:.2f}s", tag="EXPORT", - params={"duration": time.perf_counter() - start_export_time} + params={"duration": time.perf_counter() - start_export_time}, ) # Define delayed content getter async def get_delayed_content(delay: float = 5.0) -> str: - if self.config.verbose: - self.logger.info( - message="Waiting for {delay} seconds before retrieving content for {url}", - tag="INFO", - params={"delay": delay, "url": url} - ) + self.logger.info( + message="Waiting for {delay} seconds before retrieving content for {url}", + tag="INFO", + params={"delay": delay, "url": url}, + ) await asyncio.sleep(delay) return await page.content() @@ -973,56 +1467,101 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): screenshot=screenshot_data, pdf_data=pdf_data, get_delayed_content=get_delayed_content, - downloaded_files=self._downloaded_files if self._downloaded_files else None + ssl_certificate=ssl_cert, + downloaded_files=( + self._downloaded_files if self._downloaded_files else None + ), ) except Exception as e: raise e async def _handle_full_page_scan(self, page: Page, scroll_delay: float): - """Helper method to handle full page scanning""" + """ + Helper method to handle full page scanning. + + How it works: + 1. Get the viewport height. + 2. Scroll to the bottom of the page. + 3. Get the total height of the page. + 4. Scroll back to the top of the page. + 5. Scroll to the bottom of the page again. + 6. Continue scrolling until the bottom of the page is reached. 
+ + Args: + page (Page): The Playwright page object + scroll_delay (float): The delay between page scrolls + + """ try: - viewport_height = page.viewport_size.get("height", self.browser_config.viewport_height) + viewport_height = page.viewport_size.get( + "height", self.browser_config.viewport_height + ) current_position = viewport_height - - await page.evaluate(f"window.scrollTo(0, {current_position})") - await asyncio.sleep(scroll_delay) - - total_height = await page.evaluate("document.documentElement.scrollHeight") + + # await page.evaluate(f"window.scrollTo(0, {current_position})") + await self.safe_scroll(page, 0, current_position) + # await self.csp_scroll_to(page, 0, current_position) + # await asyncio.sleep(scroll_delay) + + # total_height = await page.evaluate("document.documentElement.scrollHeight") + dimensions = await self.get_page_dimensions(page) + total_height = dimensions['height'] while current_position < total_height: current_position = min(current_position + viewport_height, total_height) - await page.evaluate(f"window.scrollTo(0, {current_position})") - await asyncio.sleep(scroll_delay) + await self.safe_scroll(page, 0, current_position) + # await page.evaluate(f"window.scrollTo(0, {current_position})") + # await asyncio.sleep(scroll_delay) + + # new_height = await page.evaluate("document.documentElement.scrollHeight") + dimensions = await self.get_page_dimensions(page) + new_height = dimensions['height'] - new_height = await page.evaluate("document.documentElement.scrollHeight") if new_height > total_height: total_height = new_height - - await page.evaluate("window.scrollTo(0, 0)") - + + # await page.evaluate("window.scrollTo(0, 0)") + await self.safe_scroll(page, 0, 0) + except Exception as e: self.logger.warning( message="Failed to perform full page scan: {error}", tag="PAGE_SCAN", - params={"error": str(e)} + params={"error": str(e)}, ) else: - await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") - - + # await 
page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await self.safe_scroll(page, 0, total_height) + async def _handle_download(self, download): - """Handle file downloads.""" + """ + Handle file downloads. + + How it works: + 1. Get the suggested filename. + 2. Get the download path. + 3. Log the download. + 4. Start the download. + 5. Save the downloaded file. + 6. Log the completion. + + Args: + download (Download): The Playwright download object + + Returns: + None + """ try: suggested_filename = download.suggested_filename download_path = os.path.join(self.downloads_path, suggested_filename) - + self.logger.info( message="Downloading {filename} to {path}", tag="FETCH", - params={"filename": suggested_filename, "path": download_path} + params={"filename": suggested_filename, "path": download_path}, ) - + start_time = time.perf_counter() await download.save_as(download_path) end_time = time.perf_counter() @@ -1031,106 +1570,154 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.logger.success( message="Downloaded {filename} successfully", tag="COMPLETE", - params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} - ) + params={ + "filename": suggested_filename, + "path": download_path, + "duration": f"{end_time - start_time:.2f}s", + }, + ) except Exception as e: self.logger.error( message="Failed to handle download: {error}", tag="ERROR", - params={"error": str(e)} + params={"error": str(e)}, ) - - - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: - semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed - semaphore = asyncio.Semaphore(semaphore_count) - - async def crawl_with_semaphore(url): - async with semaphore: - return await self.crawl(url, **kwargs) - - tasks = [crawl_with_semaphore(url) for url in urls] - results = await asyncio.gather(*tasks, return_exceptions=True) - return [result if not isinstance(result, Exception) else 
str(result) for result in results] async def remove_overlay_elements(self, page: Page) -> None: """ Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. - + Args: page (Page): The Playwright page instance """ remove_overlays_js = load_js_script("remove_overlay_elements") - + try: - await page.evaluate(remove_overlays_js) + await page.evaluate(f""" + (() => {{ + try {{ + {remove_overlays_js} + return {{ success: true }}; + }} catch (error) {{ + return {{ + success: false, + error: error.toString(), + stack: error.stack + }}; + }} + }})() + """) await page.wait_for_timeout(500) # Wait for any animations to complete except Exception as e: self.logger.warning( message="Failed to remove overlay elements: {error}", tag="SCRAPE", - params={"error": str(e)} - ) + params={"error": str(e)}, + ) async def export_pdf(self, page: Page) -> bytes: """ Exports the current page as a PDF. + + Args: + page (Page): The Playwright page object + + Returns: + bytes: The PDF data """ pdf_data = await page.pdf(print_background=True) return pdf_data async def take_screenshot(self, page, **kwargs) -> str: - page_height = await page.evaluate("document.documentElement.scrollHeight") - if page_height < kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD): + """ + Take a screenshot of the current page. 
+ + Args: + page (Page): The Playwright page object + kwargs: Additional keyword arguments + + Returns: + str: The base64-encoded screenshot data + """ + dimensions = await self.get_page_dimensions(page) + page_height = dimensions['height'] + if page_height < kwargs.get( + "screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD + ): # Page is short enough, just take a screenshot return await self.take_screenshot_naive(page) else: # Page is too long, try to take a full-page screenshot return await self.take_screenshot_scroller(page, **kwargs) - # return await self.take_screenshot_from_pdf(await self.export_pdf(page)) + # return await self.take_screenshot_from_pdf(await self.export_pdf(page)) async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str: """ - Convert the first page of the PDF to a screenshot. + Convert the first page of the PDF to a screenshot. + Requires pdf2image and poppler. + + Args: + pdf_data (bytes): The PDF data + + Returns: + str: The base64-encoded screenshot data """ try: from pdf2image import convert_from_bytes + images = convert_from_bytes(pdf_data) - final_img = images[0].convert('RGB') + final_img = images[0].convert("RGB") buffered = BytesIO() final_img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') + return base64.b64encode(buffered.getvalue()).decode("utf-8") except Exception as e: error_message = f"Failed to take PDF-based screenshot: {str(e)}" self.logger.error( message="PDF Screenshot failed: {error}", tag="ERROR", - params={"error": error_message} + params={"error": error_message}, ) # Return error image as fallback - img = Image.new('RGB', (800, 600), color='black') + img = Image.new("RGB", (800, 600), color="black") draw = ImageDraw.Draw(img) font = ImageFont.load_default() draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) buffered = BytesIO() img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') + return 
base64.b64encode(buffered.getvalue()).decode("utf-8") async def take_screenshot_scroller(self, page: Page, **kwargs) -> str: """ Attempt to set a large viewport and take a full-page screenshot. If still too large, segment the page as before. + + Requires pdf2image and poppler. + + Args: + page (Page): The Playwright page object + kwargs: Additional keyword arguments + + Returns: + str: The base64-encoded screenshot data """ try: # Get page height - page_height = await page.evaluate("document.documentElement.scrollHeight") - page_width = await page.evaluate("document.documentElement.scrollWidth") + dimensions = await self.get_page_dimensions(page) + page_width = dimensions['width'] + page_height = dimensions['height'] + # page_height = await page.evaluate("document.documentElement.scrollHeight") + # page_width = await page.evaluate("document.documentElement.scrollWidth") # Set a large viewport - large_viewport_height = min(page_height, kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)) - await page.set_viewport_size({"width": page_width, "height": large_viewport_height}) - + large_viewport_height = min( + page_height, + kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD), + ) + await page.set_viewport_size( + {"width": page_width, "height": large_viewport_height} + ) + # Page still too long, segment approach segments = [] viewport_size = page.viewport_size @@ -1142,21 +1729,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await page.evaluate(f"window.scrollTo(0, {y_offset})") await asyncio.sleep(0.01) # wait for render seg_shot = await page.screenshot(full_page=False) - img = Image.open(BytesIO(seg_shot)).convert('RGB') + img = Image.open(BytesIO(seg_shot)).convert("RGB") segments.append(img) total_height = sum(img.height for img in segments) - stitched = Image.new('RGB', (segments[0].width, total_height)) + stitched = Image.new("RGB", (segments[0].width, total_height)) offset = 0 for img in segments: # 
stitched.paste(img, (0, offset)) - stitched.paste(img.convert('RGB'), (0, offset)) + stitched.paste(img.convert("RGB"), (0, offset)) offset += img.height buffered = BytesIO() - stitched = stitched.convert('RGB') + stitched = stitched.convert("RGB") stitched.save(buffered, format="BMP", quality=85) - encoded = base64.b64encode(buffered.getvalue()).decode('utf-8') + encoded = base64.b64encode(buffered.getvalue()).decode("utf-8") return encoded except Exception as e: @@ -1164,107 +1751,411 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.logger.error( message="Large viewport screenshot failed: {error}", tag="ERROR", - params={"error": error_message} + params={"error": error_message}, ) # return error image - img = Image.new('RGB', (800, 600), color='black') + img = Image.new("RGB", (800, 600), color="black") draw = ImageDraw.Draw(img) font = ImageFont.load_default() draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) buffered = BytesIO() img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') + return base64.b64encode(buffered.getvalue()).decode("utf-8") finally: await page.close() - + async def take_screenshot_naive(self, page: Page) -> str: """ Takes a screenshot of the current page. 
- + Args: page (Page): The Playwright page instance - + Returns: str: Base64-encoded screenshot image """ try: # The page is already loaded, just take the screenshot screenshot = await page.screenshot(full_page=False) - return base64.b64encode(screenshot).decode('utf-8') + return base64.b64encode(screenshot).decode("utf-8") except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" self.logger.error( message="Screenshot failed: {error}", tag="ERROR", - params={"error": error_message} + params={"error": error_message}, ) - # Generate an error image - img = Image.new('RGB', (800, 600), color='black') + img = Image.new("RGB", (800, 600), color="black") draw = ImageDraw.Draw(img) font = ImageFont.load_default() draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) - + buffered = BytesIO() img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') + return base64.b64encode(buffered.getvalue()).decode("utf-8") finally: await page.close() - + async def export_storage_state(self, path: str = None) -> dict: """ Exports the current storage state (cookies, localStorage, sessionStorage) to a JSON file at the specified path. + + Args: + path (str): The path to save the storage state JSON file + + Returns: + dict: The exported storage state """ if self.default_context: state = await self.default_context.storage_state(path=path) self.logger.info( message="Exported storage state to {path}", tag="INFO", - params={"path": path} + params={"path": path}, ) return state else: self.logger.warning( message="No default_context available to export storage state.", - tag="WARNING" + tag="WARNING", ) - - async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: + + async def robust_execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]: """ - Generates a screenshot from raw HTML content. - - Args: - html (str): The HTML content to render and capture. 
- + Executes user-provided JavaScript code with proper error handling and context, + supporting both synchronous and async user code, plus navigations. + + How it works: + 1. Wait for load state 'domcontentloaded' + 2. If js_code is a string, execute it directly + 3. If js_code is a list, execute each element in sequence + 4. Wait for load state 'networkidle' + 5. Return results + + Args: + page (Page): The Playwright page instance + js_code (Union[str, List[str]]): The JavaScript code to execute + Returns: - Optional[str]: Base64-encoded screenshot image or an error image if failed. + Dict[str, Any]: The results of the execution """ try: - await self.start() - # Create a temporary page without a session_id - page, context = await self.browser_manager.get_page(None, self.user_agent) + await page.wait_for_load_state('domcontentloaded') - await page.set_content(html, wait_until='networkidle') - screenshot = await page.screenshot(full_page=True) - await page.close() - return base64.b64encode(screenshot).decode('utf-8') + if isinstance(js_code, str): + scripts = [js_code] + else: + scripts = js_code + + results = [] + for script in scripts: + try: + # Attempt the evaluate + # If the user code triggers navigation, we catch the "context destroyed" error + # then wait for the new page to load before continuing + result = None + try: + result = await page.evaluate(f""" + (async () => {{ + try {{ + {script} + return {{ success: true }}; + }} catch (err) {{ + return {{ success: false, error: err.toString(), stack: err.stack }}; + }} + }})(); + """) + except Error as e: + # If it's due to navigation destroying the context, handle gracefully + if "Execution context was destroyed" in str(e): + self.logger.info("Navigation triggered by script, waiting for load state", tag="JS_EXEC") + try: + await page.wait_for_load_state('load', timeout=30000) + except Error as nav_err: + self.logger.warning( + message="Navigation wait failed: {error}", + tag="JS_EXEC", + params={"error": 
str(nav_err)} + ) + try: + await page.wait_for_load_state('networkidle', timeout=30000) + except Error as nav_err: + self.logger.warning( + message="Network idle wait failed: {error}", + tag="JS_EXEC", + params={"error": str(nav_err)} + ) + # Return partial success, or adapt as you see fit + result = { + "success": True, + "info": "Navigation triggered, ignoring context destroyed error" + } + else: + # It's some other error, log and continue + self.logger.error( + message="Playwright execution error: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + result = {"success": False, "error": str(e)} + + # If we made it this far with no repeated error, do post-load waits + t1 = time.time() + try: + await page.wait_for_load_state('domcontentloaded', timeout=5000) + print("DOM content loaded after script execution in", time.time() - t1) + except Error as e: + self.logger.warning( + message="DOM content load timeout: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + + # t1 = time.time() + # try: + # await page.wait_for_load_state('networkidle', timeout=5000) + # print("Network idle after script execution in", time.time() - t1) + # except Error as e: + # self.logger.warning( + # message="Network idle timeout: {error}", + # tag="JS_EXEC", + # params={"error": str(e)} + # ) + + results.append(result if result else {"success": True}) + + except Exception as e: + # Catch anything else + self.logger.error( + message="Script chunk failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + results.append({"success": False, "error": str(e)}) + + return {"success": True, "results": results} + except Exception as e: - error_message = f"Failed to take screenshot: {str(e)}" self.logger.error( - message="Screenshot failed: {error}", - tag="ERROR", - params={"error": error_message} - ) + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + return {"success": False, "error": str(e)} - # Generate an error image - img = 
Image.new('RGB', (800, 600), color='black') - draw = ImageDraw.Draw(img) - font = ImageFont.load_default() - draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + async def execute_user_script(self, page: Page, js_code: Union[str, List[str]]) -> Dict[str, Any]: + """ + Executes user-provided JavaScript code with proper error handling and context. + + Args: + page: Playwright page object + js_code: Single JavaScript string or list of JavaScript code strings + + Returns: + Dict containing execution status and results/errors + """ + try: + # Ensure the page is ready for script execution + await page.wait_for_load_state('domcontentloaded') + + # Handle single script or multiple scripts + if isinstance(js_code, str): + scripts = [js_code] + else: + scripts = js_code + + results = [] + for script in scripts: + try: + # Execute the script and wait for network idle + result = await page.evaluate(f""" + (() => {{ + return new Promise((resolve) => {{ + try {{ + const result = (function() {{ + {script} + }})(); + + // If result is a promise, wait for it + if (result instanceof Promise) {{ + result.then(() => {{ + // Wait a bit for any triggered effects + setTimeout(() => resolve({{ success: true }}), 100); + }}).catch(error => {{ + resolve({{ + success: false, + error: error.toString(), + stack: error.stack + }}); + }}); + }} else {{ + // For non-promise results, still wait a bit for effects + setTimeout(() => resolve({{ success: true }}), 100); + }} + }} catch (error) {{ + resolve({{ + success: false, + error: error.toString(), + stack: error.stack + }}); + }} + }}); + }})() + """) + + # Wait for network idle after script execution + t1 = time.time() + await page.wait_for_load_state('domcontentloaded', timeout=5000) + print("DOM content loaded after script execution in", time.time() - t1) - buffered = BytesIO() - img.save(buffered, format="JPEG") - return base64.b64encode(buffered.getvalue()).decode('utf-8') + t1 = time.time() + await 
page.wait_for_load_state('networkidle', timeout=5000) + print("Network idle after script execution in", time.time() - t1) + + results.append(result if result else {"success": True}) + + except Error as e: + # Handle Playwright-specific errors + self.logger.error( + message="Playwright execution error: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + results.append({"success": False, "error": str(e)}) + + return {"success": True, "results": results} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + return {"success": False, "error": str(e)} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)} + ) + return {"success": False, "error": str(e)} + async def check_visibility(self, page): + """ + Checks if an element is visible on the page. + + Args: + page: Playwright page object + + Returns: + Boolean indicating visibility + """ + return await page.evaluate(""" + () => { + const element = document.body; + if (!element) return false; + const style = window.getComputedStyle(element); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + return isVisible; + } + """) + + async def safe_scroll(self, page: Page, x: int, y: int): + """ + Safely scroll the page with rendering time. + + Args: + page: Playwright page object + x: Horizontal scroll position + y: Vertical scroll position + """ + result = await self.csp_scroll_to(page, x, y) + if result['success']: + await page.wait_for_timeout(100) # Allow for rendering + return result + + async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]: + """ + Performs a CSP-compliant scroll operation and returns the result status. 
+ + Args: + page: Playwright page object + x: Horizontal scroll position + y: Vertical scroll position + + Returns: + Dict containing scroll status and position information + """ + try: + result = await page.evaluate( + f"""() => {{ + try {{ + const startX = window.scrollX; + const startY = window.scrollY; + window.scrollTo({x}, {y}); + + // Get final position after scroll + const endX = window.scrollX; + const endY = window.scrollY; + + return {{ + success: true, + startPosition: {{ x: startX, y: startY }}, + endPosition: {{ x: endX, y: endY }}, + targetPosition: {{ x: {x}, y: {y} }}, + delta: {{ + x: Math.abs(endX - {x}), + y: Math.abs(endY - {y}) + }} + }}; + }} catch (e) {{ + return {{ + success: false, + error: e.toString() + }}; + }} + }}""" + ) + + if not result['success']: + self.logger.warning( + message="Scroll operation failed: {error}", + tag="SCROLL", + params={"error": result.get('error')} + ) + + return result + + except Exception as e: + self.logger.error( + message="Failed to execute scroll: {error}", + tag="SCROLL", + params={"error": str(e)} + ) + return { + "success": False, + "error": str(e) + } + + async def get_page_dimensions(self, page: Page): + """ + Get the dimensions of the page. 
+ + Args: + page: Playwright page object + + Returns: + Dict containing width and height of the page + """ + return await page.evaluate(""" + () => { + const {scrollWidth, scrollHeight} = document.documentElement; + return {width: scrollWidth, height: scrollHeight}; + } + """) \ No newline at end of file diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 5cdafac2..aed9c76b 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -7,7 +7,7 @@ from contextlib import asynccontextmanager import logging import json # Added for serialization/deserialization from .utils import ensure_content_dirs, generate_content_hash -from .models import CrawlResult +from .models import CrawlResult, MarkdownGenerationResult import xxhash import aiofiles from .config import NEED_MIGRATION @@ -295,13 +295,18 @@ class AsyncDatabaseManager: row_dict[field] = "" # Parse JSON fields - json_fields = ['media', 'links', 'metadata', 'response_headers'] + json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown'] for field in json_fields: try: row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {} except json.JSONDecodeError: row_dict[field] = {} + if isinstance(row_dict['markdown'], Dict): + row_dict['markdown_v2'] = row_dict['markdown'] + if row_dict['markdown'].get('raw_markdown'): + row_dict['markdown'] = row_dict['markdown']['raw_markdown'] + # Parse downloaded_files try: row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else [] @@ -331,10 +336,28 @@ class AsyncDatabaseManager: content_map = { 'html': (result.html, 'html'), 'cleaned_html': (result.cleaned_html or "", 'cleaned'), - 'markdown': (result.markdown or "", 'markdown'), + 'markdown': None, 'extracted_content': (result.extracted_content or "", 'extracted'), 'screenshot': (result.screenshot or "", 'screenshots') } + + try: + if isinstance(result.markdown, MarkdownGenerationResult): + 
content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown') + elif hasattr(result, 'markdown_v2'): + content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown') + elif isinstance(result.markdown, str): + markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown) + content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown') + else: + content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown') + except Exception as e: + self.logger.warning( + message=f"Error processing markdown content: {str(e)}", + tag="WARNING" + ) + # Fallback to empty markdown result + content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown') content_hashes = {} for field, (content, content_type) in content_map.items(): diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 220edd11..5d2d54b5 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -42,7 +42,7 @@ class AsyncLogger: def __init__( self, log_file: Optional[str] = None, - log_level: LogLevel = LogLevel.INFO, + log_level: LogLevel = LogLevel.DEBUG, tag_width: int = 10, icons: Optional[Dict[str, str]] = None, colors: Optional[Dict[LogLevel, str]] = None, diff --git a/crawl4ai/async_tools.py b/crawl4ai/async_tools.py deleted file mode 100644 index 157e5596..00000000 --- a/crawl4ai/async_tools.py +++ /dev/null @@ -1,183 +0,0 @@ -import asyncio -import base64 -import time -from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional, Awaitable -import os, sys, shutil -import tempfile, subprocess -from playwright.async_api import async_playwright, Page, Browser, Error -from playwright.async_api import TimeoutError as PlaywrightTimeoutError -from io import BytesIO -from PIL import Image, ImageDraw, ImageFont -from pathlib import Path -from playwright.async_api import ProxySettings -from pydantic import BaseModel -import hashlib -import json -import uuid -from .models 
import AsyncCrawlResponse -from .utils import create_box_message -from .user_agent_generator import UserAgentGenerator -from playwright_stealth import StealthConfig, stealth_async - - -class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None, host: str = "localhost", debugging_port: int = 9222): - self.browser_type = browser_type - self.user_data_dir = user_data_dir - self.headless = headless - self.browser_process = None - self.temp_dir = None - self.debugging_port = debugging_port - self.host = host - self.logger = logger - self.shutting_down = False - - async def start(self) -> str: - """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. - """ - - # Create temp dir if needed - if not self.user_data_dir: - self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-") - self.user_data_dir = self.temp_dir - - # Get browser path and args based on OS and browser type - browser_path = self._get_browser_path() - args = self._get_browser_args() - - # Start browser process - try: - self.browser_process = subprocess.Popen( - args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - # Monitor browser process output for errors - asyncio.create_task(self._monitor_browser_process()) - await asyncio.sleep(2) # Give browser time to start - return f"http://{self.host}:{self.debugging_port}" - except Exception as e: - await self.cleanup() - raise Exception(f"Failed to start browser: {e}") - - async def _monitor_browser_process(self): - """Monitor the browser process for unexpected termination.""" - if self.browser_process: - try: - stdout, stderr = await asyncio.gather( - asyncio.to_thread(self.browser_process.stdout.read), - asyncio.to_thread(self.browser_process.stderr.read) - ) - - # Check shutting_down flag BEFORE logging anything - if self.browser_process.poll() is not None: - if not 
self.shutting_down: - self.logger.error( - message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", - tag="ERROR", - params={ - "code": self.browser_process.returncode, - "stdout": stdout.decode(), - "stderr": stderr.decode() - } - ) - await self.cleanup() - else: - self.logger.info( - message="Browser process terminated normally | Code: {code}", - tag="INFO", - params={"code": self.browser_process.returncode} - ) - except Exception as e: - if not self.shutting_down: - self.logger.error( - message="Error monitoring browser process: {error}", - tag="ERROR", - params={"error": str(e)} - ) - - def _get_browser_path(self) -> str: - """Returns the browser executable path based on OS and browser type""" - if sys.platform == "darwin": # macOS - paths = { - "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", - "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox", - "webkit": "/Applications/Safari.app/Contents/MacOS/Safari" - } - elif sys.platform == "win32": # Windows - paths = { - "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe", - "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe", - "webkit": None # WebKit not supported on Windows - } - else: # Linux - paths = { - "chromium": "google-chrome", - "firefox": "firefox", - "webkit": None # WebKit not supported on Linux - } - - return paths.get(self.browser_type) - - def _get_browser_args(self) -> List[str]: - """Returns browser-specific command line arguments""" - base_args = [self._get_browser_path()] - - if self.browser_type == "chromium": - args = [ - f"--remote-debugging-port={self.debugging_port}", - f"--user-data-dir={self.user_data_dir}", - ] - if self.headless: - args.append("--headless=new") - elif self.browser_type == "firefox": - args = [ - "--remote-debugging-port", str(self.debugging_port), - "--profile", self.user_data_dir, - ] - if self.headless: - args.append("--headless") - else: - raise 
NotImplementedError(f"Browser type {self.browser_type} not supported") - - return base_args + args - - async def cleanup(self): - """Cleanup browser process and temporary directory""" - # Set shutting_down flag BEFORE any termination actions - self.shutting_down = True - - if self.browser_process: - try: - self.browser_process.terminate() - # Wait for process to end gracefully - for _ in range(10): # 10 attempts, 100ms each - if self.browser_process.poll() is not None: - break - await asyncio.sleep(0.1) - - # Force kill if still running - if self.browser_process.poll() is None: - self.browser_process.kill() - await asyncio.sleep(0.1) # Brief wait for kill to take effect - - except Exception as e: - self.logger.error( - message="Error terminating browser: {error}", - tag="ERROR", - params={"error": str(e)} - ) - - if self.temp_dir and os.path.exists(self.temp_dir): - try: - shutil.rmtree(self.temp_dir) - except Exception as e: - self.logger.error( - message="Error removing temporary directory: {error}", - tag="ERROR", - params={"error": str(e)} - ) - diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 9b968158..f99586a3 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -42,13 +42,65 @@ class AsyncWebCrawler: """ Asynchronous web crawler with flexible caching capabilities. + There are two ways to use the crawler: + + 1. Using context manager (recommended for simple cases): + ```python + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") + ``` + + 2. 
Using explicit lifecycle management (recommended for long-running applications): + ```python + crawler = AsyncWebCrawler() + await crawler.start() + + # Use the crawler multiple times + result1 = await crawler.arun(url="https://example.com") + result2 = await crawler.arun(url="https://another.com") + + await crawler.close() + ``` + Migration Guide: Old way (deprecated): crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True) New way (recommended): browser_config = BrowserConfig(browser_type="chromium", headless=True) - crawler = AsyncWebCrawler(browser_config=browser_config) + crawler = AsyncWebCrawler(config=browser_config) + + + Attributes: + browser_config (BrowserConfig): Configuration object for browser settings. + crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages. + logger (AsyncLogger): Logger instance for recording events and errors. + always_bypass_cache (bool): Whether to always bypass cache. + crawl4ai_folder (str): Directory for storing cache. + base_directory (str): Base directory for storing cache. + ready (bool): Whether the crawler is ready for use. + + Methods: + start(): Start the crawler explicitly without using context manager. + close(): Close the crawler explicitly without using context manager. + arun(): Run the crawler for a single source: URL (web, local file, or raw HTML). + awarmup(): Perform warmup sequence. + arun_many(): Run the crawler for multiple sources. + aprocess_html(): Process HTML content. 
+ + Typical Usage: + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com") + print(result.markdown) + + Using configuration: + browser_config = BrowserConfig(browser_type="chromium", headless=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS + ) + result = await crawler.arun(url="https://example.com", config=crawler_config) + print(result.markdown) """ _domain_last_hit = {} @@ -97,12 +149,19 @@ class AsyncWebCrawler: # Initialize crawler strategy + params = { + k:v for k, v in kwargs.items() if k in ['browser_congig', 'logger'] + } self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( browser_config=browser_config, logger=self.logger, - **kwargs # Pass remaining kwargs for backwards compatibility + **params # Pass remaining kwargs for backwards compatibility ) + # If crawler strategy doesn't have logger, use crawler logger + if not self.crawler_strategy.logger: + self.crawler_strategy.logger = self.logger + # Handle deprecated cache parameter if always_by_pass_cache is not None: if kwargs.get("warning", True): @@ -127,16 +186,49 @@ class AsyncWebCrawler: self.ready = False - async def __aenter__(self): + async def start(self): + """ + Start the crawler explicitly without using context manager. + This is equivalent to using 'async with' but gives more control over the lifecycle. + + This method will: + 1. Initialize the browser and context + 2. Perform warmup sequence + 3. Return the crawler instance for method chaining + + Returns: + AsyncWebCrawler: The initialized crawler instance + """ await self.crawler_strategy.__aenter__() await self.awarmup() return self + async def close(self): + """ + Close the crawler explicitly without using context manager. + This should be called when you're done with the crawler if you used start(). + + This method will: + 1. Clean up browser resources + 2. 
Close any open pages and contexts + """ + await self.crawler_strategy.__aexit__(None, None, None) + + async def __aenter__(self): + return await self.start() + async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) + await self.close() async def awarmup(self): - """Initialize the crawler with warm-up sequence.""" + """ + Initialize the crawler with warm-up sequence. + + This method: + 1. Logs initialization info + 2. Sets up browser configuration + 3. Marks the crawler as ready + """ self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") self.ready = True @@ -144,7 +236,7 @@ class AsyncWebCrawler: async def nullcontext(self): """异步空上下文管理器""" yield - + async def arun( self, url: str, @@ -179,7 +271,7 @@ class AsyncWebCrawler: screenshot=True, ... ) - + New way (recommended): config = CrawlerRunConfig( word_count_threshold=200, @@ -192,7 +284,7 @@ class AsyncWebCrawler: url: The URL to crawl (http://, https://, file://, or raw:) crawler_config: Configuration object controlling crawl behavior [other parameters maintained for backwards compatibility] - + Returns: CrawlResult: The result of crawling and processing """ @@ -204,14 +296,14 @@ class AsyncWebCrawler: try: # Handle configuration if crawler_config is not None: - if any(param is not None for param in [ - word_count_threshold, extraction_strategy, chunking_strategy, - content_filter, cache_mode, css_selector, screenshot, pdf - ]): - self.logger.warning( - message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", - tag="WARNING" - ) + # if any(param is not None for param in [ + # word_count_threshold, extraction_strategy, chunking_strategy, + # content_filter, cache_mode, css_selector, screenshot, pdf + # ]): + # self.logger.warning( + # message="Both crawler_config and legacy parameters provided. 
crawler_config will take precedence.", + # tag="WARNING" + # ) config = crawler_config else: # Merge all parameters into a single kwargs dict for config creation @@ -261,7 +353,7 @@ class AsyncWebCrawler: # Initialize processing variables async_response: AsyncCrawlResponse = None - cached_result = None + cached_result: CrawlResult = None screenshot_data = None pdf_data = None extracted_content = None @@ -274,6 +366,7 @@ class AsyncWebCrawler: if cached_result: html = sanitize_input_encode(cached_result.html) extracted_content = sanitize_input_encode(cached_result.extracted_content or "") + extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content # If screenshot is requested but its not in cache, then set cache_result to None screenshot_data = cached_result.screenshot pdf_data = cached_result.pdf @@ -312,49 +405,89 @@ class AsyncWebCrawler: tag="FETCH" ) - # Process the HTML content - crawl_result = await self.aprocess_html( - url=url, - html=html, - extracted_content=extracted_content, - config=config, # Pass the config object instead of individual parameters - screenshot=screenshot_data, - pdf_data=pdf_data, - verbose=config.verbose, - **kwargs - ) + # Process the HTML content + crawl_result = await self.aprocess_html( + url=url, + html=html, + extracted_content=extracted_content, + config=config, # Pass the config object instead of individual parameters + screenshot=screenshot_data, + pdf_data=pdf_data, + verbose=config.verbose, + is_raw_html = True if url.startswith("raw:") else False, + **kwargs + ) + + # crawl_result.status_code = async_response.status_code + # crawl_result.response_headers = async_response.response_headers + # crawl_result.downloaded_files = async_response.downloaded_files + # crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate + # else: + # crawl_result.status_code = 200 + # crawl_result.response_headers = cached_result.response_headers if cached_result else {} + # 
crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache + + # # Check and set values from async_response to crawl_result + try: + for key in vars(async_response): + if hasattr(crawl_result, key): + value = getattr(async_response, key, None) + current_value = getattr(crawl_result, key, None) + if value is not None and not current_value: + try: + setattr(crawl_result, key, value) + except Exception as e: + self.logger.warning( + message=f"Failed to set attribute {key}: {str(e)}", + tag="WARNING" + ) + except Exception as e: + self.logger.warning( + message=f"Error copying response attributes: {str(e)}", + tag="WARNING" + ) + + crawl_result.success = bool(html) + crawl_result.session_id = getattr(config, 'session_id', None) + + self.logger.success( + message="{url:.50}... | Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) + + return crawl_result - # Set response data - if async_response: - crawl_result.status_code = async_response.status_code - crawl_result.response_headers = async_response.response_headers - crawl_result.downloaded_files = async_response.downloaded_files else: - crawl_result.status_code = 200 - crawl_result.response_headers = cached_result.response_headers if cached_result else {} + self.logger.success( + message="{url:.50}... 
| Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": True, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN, + "timing": Fore.YELLOW + } + ) - crawl_result.success = bool(html) - crawl_result.session_id = getattr(config, 'session_id', None) - - self.logger.success( - message="{url:.50}... | Status: {status} | Total: {timing}", - tag="COMPLETE", - params={ - "url": cache_context.display_url, - "status": crawl_result.success, - "timing": f"{time.perf_counter() - start_time:.2f}s" - }, - colors={ - "status": Fore.GREEN if crawl_result.success else Fore.RED, - "timing": Fore.YELLOW - } - ) - - # Update cache if appropriate - if cache_context.should_write() and not bool(cached_result): - await async_db_manager.acache_url(crawl_result) - - return crawl_result + cached_result.success = bool(html) + cached_result.session_id = getattr(config, 'session_id', None) + return cached_result except Exception as e: error_context = get_error_context(sys.exc_info()) @@ -401,6 +534,7 @@ class AsyncWebCrawler: extracted_content: Previously extracted content (if any) config: Configuration object controlling processing behavior screenshot: Screenshot data (if any) + pdf_data: PDF data (if any) verbose: Whether to enable verbose logging **kwargs: Additional parameters for backwards compatibility @@ -415,15 +549,20 @@ class AsyncWebCrawler: scrapping_strategy = WebScrapingStrategy(logger=self.logger) # Process HTML content + params = {k:v for k, v in config.to_dict().items() if k not in ["url"]} + # add keys from kwargs to params that doesn't exist in params + params.update({k:v for k, v in kwargs.items() if k not in params.keys()}) + result = scrapping_strategy.scrap( url, html, - word_count_threshold=config.word_count_threshold, - css_selector=config.css_selector, - only_text=config.only_text, - image_description_min_word_threshold=config.image_description_min_word_threshold, - 
content_filter=config.content_filter, - **kwargs + **params, + # word_count_threshold=config.word_count_threshold, + # css_selector=config.css_selector, + # only_text=config.only_text, + # image_description_min_word_threshold=config.image_description_min_word_threshold, + # content_filter=config.content_filter, + # **kwargs ) if result is None: @@ -475,15 +614,27 @@ class AsyncWebCrawler: t1 = time.perf_counter() - # Handle different extraction strategy types - if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonCssExtractionStrategy)): - config.extraction_strategy.verbose = verbose - extracted_content = config.extraction_strategy.run(url, [html]) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - else: - sections = config.chunking_strategy.chunk(markdown) - extracted_content = config.extraction_strategy.run(url, sections) - extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) + # Choose content based on input_format + content_format = config.extraction_strategy.input_format + if content_format == "fit_markdown" and not markdown_result.fit_markdown: + self.logger.warning( + message="Fit markdown requested but not available. 
Falling back to raw markdown.", + tag="EXTRACT", + params={"url": _url} + ) + content_format = "markdown" + + content = { + "markdown": markdown, + "html": html, + "fit_markdown": markdown_result.raw_markdown + }.get(content_format, markdown) + + # Use IdentityChunking for HTML input, otherwise use provided chunking strategy + chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy + sections = chunking.chunk(content) + extracted_content = config.extraction_strategy.run(url, sections) + extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) # Log extraction completion self.logger.info( @@ -682,5 +833,3 @@ class AsyncWebCrawler: async def aget_cache_size(self): """Get the total number of cached items.""" return await async_db_manager.aget_total_count() - - diff --git a/crawl4ai/cache_context.py b/crawl4ai/cache_context.py index 429eacc1..588edd62 100644 --- a/crawl4ai/cache_context.py +++ b/crawl4ai/cache_context.py @@ -25,8 +25,26 @@ class CacheContext: This class centralizes all cache-related logic and URL type checking, making the caching behavior more predictable and maintainable. + + Attributes: + url (str): The URL being processed. + cache_mode (CacheMode): The cache mode for the current operation. + always_bypass (bool): If True, bypasses caching for this operation. + is_cacheable (bool): True if the URL is cacheable, False otherwise. + is_web_url (bool): True if the URL is a web URL, False otherwise. + is_local_file (bool): True if the URL is a local file, False otherwise. + is_raw_html (bool): True if the URL is raw HTML, False otherwise. + _url_display (str): The display name for the URL (web, local file, or raw HTML). """ def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False): + """ + Initializes the CacheContext with the provided URL and cache mode. + + Args: + url (str): The URL being processed. 
+ cache_mode (CacheMode): The cache mode for the current operation. + always_bypass (bool): If True, bypasses caching for this operation. + """ self.url = url self.cache_mode = cache_mode self.always_bypass = always_bypass @@ -37,13 +55,31 @@ class CacheContext: self._url_display = url if not self.is_raw_html else "Raw HTML" def should_read(self) -> bool: - """Determines if cache should be read based on context.""" + """ + Determines if cache should be read based on context. + + How it works: + 1. If always_bypass is True or is_cacheable is False, return False. + 2. If cache_mode is ENABLED or READ_ONLY, return True. + + Returns: + bool: True if cache should be read, False otherwise. + """ if self.always_bypass or not self.is_cacheable: return False return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY] def should_write(self) -> bool: - """Determines if cache should be written based on context.""" + """ + Determines if cache should be written based on context. + + How it works: + 1. If always_bypass is True or is_cacheable is False, return False. + 2. If cache_mode is ENABLED or WRITE_ONLY, return True. + + Returns: + bool: True if cache should be written, False otherwise. + """ if self.always_bypass or not self.is_cacheable: return False return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY] diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py index af857947..7b8c08ad 100644 --- a/crawl4ai/chunking_strategy.py +++ b/crawl4ai/chunking_strategy.py @@ -7,17 +7,43 @@ from .utils import * # Define the abstract base class for chunking strategies class ChunkingStrategy(ABC): + """ + Abstract base class for chunking strategies. + """ @abstractmethod def chunk(self, text: str) -> list: """ Abstract method to chunk the given text. + + Args: + text (str): The text to chunk. + + Returns: + list: A list of chunks. 
""" pass - + +# Create an identity chunking strategy f(x) = [x] +class IdentityChunking(ChunkingStrategy): + """ + Chunking strategy that returns the input text as a single chunk. + """ + def chunk(self, text: str) -> list: + return [text] + # Regex-based chunking class RegexChunking(ChunkingStrategy): + """ + Chunking strategy that splits text based on regular expression patterns. + """ def __init__(self, patterns=None, **kwargs): + """ + Initialize the RegexChunking object. + + Args: + patterns (list): A list of regular expression patterns to split text. + """ if patterns is None: patterns = [r'\n\n'] # Default split pattern self.patterns = patterns @@ -33,9 +59,15 @@ class RegexChunking(ChunkingStrategy): # NLP-based sentence chunking class NlpSentenceChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into sentences using NLTK's sentence tokenizer. + """ def __init__(self, **kwargs): + """ + Initialize the NlpSentenceChunking object. + """ load_nltk_punkt() - pass + def chunk(self, text: str) -> list: # Improved regex for sentence splitting @@ -52,8 +84,21 @@ class NlpSentenceChunking(ChunkingStrategy): # Topic-based segmentation using TextTiling class TopicSegmentationChunking(ChunkingStrategy): + """ + Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer. + + How it works: + 1. Segment the text into topics using TextTilingTokenizer + 2. Extract keywords for each topic segment + """ def __init__(self, num_keywords=3, **kwargs): + """ + Initialize the TopicSegmentationChunking object. + + Args: + num_keywords (int): The number of keywords to extract for each topic segment. + """ import nltk as nl self.tokenizer = nl.tokenize.TextTilingTokenizer() self.num_keywords = num_keywords @@ -83,6 +128,14 @@ class TopicSegmentationChunking(ChunkingStrategy): # Fixed-length word chunks class FixedLengthWordChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into fixed-length word chunks. 
+ + How it works: + 1. Split the text into words + 2. Create chunks of fixed length + 3. Return the list of chunks + """ def __init__(self, chunk_size=100, **kwargs): """ Initialize the fixed-length word chunking strategy with the given chunk size. @@ -98,6 +151,14 @@ class FixedLengthWordChunking(ChunkingStrategy): # Sliding window chunking class SlidingWindowChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into overlapping word chunks. + + How it works: + 1. Split the text into words + 2. Create chunks of fixed length + 3. Return the list of chunks + """ def __init__(self, window_size=100, step=50, **kwargs): """ Initialize the sliding window chunking strategy with the given window size and @@ -127,8 +188,16 @@ class SlidingWindowChunking(ChunkingStrategy): return chunks - class OverlappingWindowChunking(ChunkingStrategy): + """ + Chunking strategy that splits text into overlapping word chunks. + + How it works: + 1. Split the text into words using whitespace + 2. Create chunks of fixed length equal to the window size + 3. Slide the window by the overlap size + 4. 
Return the list of chunks + """ def __init__(self, window_size=1000, overlap=100, **kwargs): """ Initialize the overlapping window chunking strategy with the given window size and diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py new file mode 100644 index 00000000..4a01c1c2 --- /dev/null +++ b/crawl4ai/cli.py @@ -0,0 +1,105 @@ +import click +import sys +import asyncio +from typing import List +from .docs_manager import DocsManager +from .async_logger import AsyncLogger + +logger = AsyncLogger(verbose=True) +docs_manager = DocsManager(logger) + +def print_table(headers: List[str], rows: List[List[str]], padding: int = 2): + """Print formatted table with headers and rows""" + widths = [max(len(str(cell)) for cell in col) for col in zip(headers, *rows)] + border = '+' + '+'.join('-' * (w + 2 * padding) for w in widths) + '+' + + def format_row(row): + return '|' + '|'.join(f"{' ' * padding}{str(cell):<{w}}{' ' * padding}" + for cell, w in zip(row, widths)) + '|' + + click.echo(border) + click.echo(format_row(headers)) + click.echo(border) + for row in rows: + click.echo(format_row(row)) + click.echo(border) + +@click.group() +def cli(): + """Crawl4AI Command Line Interface""" + pass + +@cli.group() +def docs(): + """Documentation operations""" + pass + +@docs.command() +@click.argument('sections', nargs=-1) +@click.option('--mode', type=click.Choice(['extended', 'condensed']), default='extended') +def combine(sections: tuple, mode: str): + """Combine documentation sections""" + try: + asyncio.run(docs_manager.ensure_docs_exist()) + click.echo(docs_manager.generate(sections, mode)) + except Exception as e: + logger.error(str(e), tag="ERROR") + sys.exit(1) + +@docs.command() +@click.argument('query') +@click.option('--top-k', '-k', default=5) +@click.option('--build-index', is_flag=True, help='Build index if missing') +def search(query: str, top_k: int, build_index: bool): + """Search documentation""" + try: + result = docs_manager.search(query, top_k) + if result == "No 
search index available. Call build_search_index() first.": + if build_index or click.confirm('No search index found. Build it now?'): + asyncio.run(docs_manager.llm_text.generate_index_files()) + result = docs_manager.search(query, top_k) + click.echo(result) + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +@docs.command() +def update(): + """Update docs from GitHub""" + try: + asyncio.run(docs_manager.fetch_docs()) + click.echo("Documentation updated successfully") + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +@docs.command() +@click.option('--force-facts', is_flag=True, help='Force regenerate fact files') +@click.option('--clear-cache', is_flag=True, help='Clear BM25 cache') +def index(force_facts: bool, clear_cache: bool): + """Build or rebuild search indexes""" + try: + asyncio.run(docs_manager.ensure_docs_exist()) + asyncio.run(docs_manager.llm_text.generate_index_files( + force_generate_facts=force_facts, + clear_bm25_cache=clear_cache + )) + click.echo("Search indexes built successfully") + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +# Add docs list command +@docs.command() +def list(): + """List available documentation sections""" + try: + sections = docs_manager.list() + print_table(["Sections"], [[section] for section in sections]) + + except Exception as e: + click.echo(f"Error: {str(e)}", err=True) + sys.exit(1) + +if __name__ == '__main__': + cli() \ No newline at end of file diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 7c8a9314..c2be7638 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -13,6 +13,8 @@ PROVIDER_MODELS = { "groq/llama3-8b-8192": os.getenv("GROQ_API_KEY"), "openai/gpt-4o-mini": os.getenv("OPENAI_API_KEY"), "openai/gpt-4o": os.getenv("OPENAI_API_KEY"), + "openai/o1-mini": os.getenv("OPENAI_API_KEY"), + "openai/o1-preview": os.getenv("OPENAI_API_KEY"), "anthropic/claude-3-haiku-20240307": 
os.getenv("ANTHROPIC_API_KEY"), "anthropic/claude-3-opus-20240229": os.getenv("ANTHROPIC_API_KEY"), "anthropic/claude-3-sonnet-20240229": os.getenv("ANTHROPIC_API_KEY"), diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index f05b92fa..ab5ae517 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -9,17 +9,8 @@ from .utils import clean_tokens from abc import ABC, abstractmethod import math from snowballstemmer import stemmer - - -# import regex -# def tokenize_text(text): -# # Regular expression to match words or CJK (Chinese, Japanese, Korean) characters -# pattern = r'\p{L}+|\p{N}+|[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}ー]|[\p{P}]' -# return regex.findall(pattern, text) - -# from nltk.stem import PorterStemmer -# ps = PorterStemmer() class RelevantContentFilter(ABC): + """Abstract base class for content filtering strategies""" def __init__(self, user_query: str = None): self.user_query = user_query self.included_tags = { @@ -171,9 +162,8 @@ class RelevantContentFilter(ABC): chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold] return chunks - - def extract_text_chunks1(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]: + def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]: """Common method for extracting text chunks""" _text_cache = {} def fast_text(element: Tag) -> str: @@ -271,7 +261,38 @@ class RelevantContentFilter(ABC): return str(tag) # Fallback to original if anything fails class BM25ContentFilter(RelevantContentFilter): + """ + Content filtering using BM25 algorithm with priority tag handling. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. Extracts text chunks from the body element. + 3. Tokenizes the corpus and query. + 4. Applies BM25 algorithm to calculate scores for each chunk. + 5. Filters out chunks below the threshold. + 6. 
Sorts chunks by score in descending order. + 7. Returns the top N chunks. + + Attributes: + user_query (str): User query for filtering (optional). + bm25_threshold (float): BM25 threshold for filtering (default: 1.0). + language (str): Language for stemming (default: 'english'). + + Methods: + filter_content(self, html: str, min_word_threshold: int = None) + """ def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'): + """ + Initializes the BM25ContentFilter class, if not provided, falls back to page metadata. + + Note: + If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph. + + Args: + user_query (str): User query for filtering (optional). + bm25_threshold (float): BM25 threshold for filtering (default: 1.0). + language (str): Language for stemming (default: 'english'). + """ super().__init__(user_query=user_query) self.bm25_threshold = bm25_threshold self.priority_tags = { @@ -290,7 +311,20 @@ class BM25ContentFilter(RelevantContentFilter): self.stemmer = stemmer(language) def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: - """Implements content filtering using BM25 algorithm with priority tag handling""" + """ + Implements content filtering using BM25 algorithm with priority tag handling. + + Note: + This method implements the filtering logic for the BM25ContentFilter class. + It takes HTML content as input and returns a list of filtered text chunks. + + Args: + html (str): HTML content to be filtered. + min_word_threshold (int): Minimum word threshold for filtering (optional). + + Returns: + List[str]: List of filtered text chunks. 
+ """ if not html or not isinstance(html, str): return [] @@ -357,15 +391,42 @@ class BM25ContentFilter(RelevantContentFilter): return [self.clean_element(tag) for _, _, tag in selected_candidates] - - - - - class PruningContentFilter(RelevantContentFilter): + """ + Content filtering using pruning algorithm with dynamic threshold. + + How it works: + 1. Extracts page metadata with fallbacks. + 2. Extracts text chunks from the body element. + 3. Applies pruning algorithm to calculate scores for each chunk. + 4. Filters out chunks below the threshold. + 5. Sorts chunks by score in descending order. + 6. Returns the top N chunks. + + Attributes: + user_query (str): User query for filtering (optional), if not provided, falls back to page metadata. + min_word_threshold (int): Minimum word threshold for filtering (optional). + threshold_type (str): Threshold type for dynamic threshold (default: 'fixed'). + threshold (float): Fixed threshold value (default: 0.48). + + Methods: + filter_content(self, html: str, min_word_threshold: int = None): + """ def __init__(self, user_query: str = None, min_word_threshold: int = None, threshold_type: str = 'fixed', threshold: float = 0.48): - super().__init__(user_query) + """ + Initializes the PruningContentFilter class, if not provided, falls back to page metadata. + + Note: + If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph. + + Args: + user_query (str): User query for filtering (optional). + min_word_threshold (int): Minimum word threshold for filtering (optional). + threshold_type (str): Threshold type for dynamic threshold (default: 'fixed'). + threshold (float): Fixed threshold value (default: 0.48). 
+ """ + super().__init__(None) self.min_word_threshold = min_word_threshold self.threshold_type = threshold_type self.threshold = threshold @@ -418,6 +479,20 @@ class PruningContentFilter(RelevantContentFilter): } def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]: + """ + Implements content filtering using pruning algorithm with dynamic threshold. + + Note: + This method implements the filtering logic for the PruningContentFilter class. + It takes HTML content as input and returns a list of filtered text chunks. + + Args: + html (str): HTML content to be filtered. + min_word_threshold (int): Minimum word threshold for filtering (optional). + + Returns: + List[str]: List of filtered text chunks. + """ if not html or not isinstance(html, str): return [] @@ -444,15 +519,23 @@ class PruningContentFilter(RelevantContentFilter): return content_blocks def _remove_comments(self, soup): + """Removes HTML comments""" for element in soup(text=lambda text: isinstance(text, Comment)): element.extract() def _remove_unwanted_tags(self, soup): + """Removes unwanted tags""" for tag in self.excluded_tags: for element in soup.find_all(tag): element.decompose() def _prune_tree(self, node): + """ + Prunes the tree starting from the given node. + + Args: + node (Tag): The node from which the pruning starts. 
+ """ if not node or not hasattr(node, 'name') or node.name is None: return @@ -495,6 +578,7 @@ class PruningContentFilter(RelevantContentFilter): self._prune_tree(child) def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len): + """Computes the composite score""" if self.min_word_threshold: # Get raw text from metrics node - avoid extra processing text = metrics['node'].get_text(strip=True) @@ -531,6 +615,7 @@ class PruningContentFilter(RelevantContentFilter): return score / total_weight if total_weight > 0 else 0 def _compute_class_id_weight(self, node): + """Computes the class ID weight""" class_id_score = 0 if 'class' in node.attrs: classes = ' '.join(node['class']) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 4ba9a605..985ff592 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1,4 +1,5 @@ import re # Point 1: Pre-Compile Regular Expressions +import time from abc import ABC, abstractmethod from typing import Dict, Any, Optional from bs4 import BeautifulSoup @@ -16,7 +17,8 @@ from .models import MarkdownGenerationResult from .utils import ( extract_metadata, normalize_url, - is_external_url + is_external_url, + get_base_domain, ) @@ -62,6 +64,17 @@ class ContentScrapingStrategy(ABC): pass class WebScrapingStrategy(ContentScrapingStrategy): + """ + Class for web content scraping. Perhaps the most important class. + + How it works: + 1. Extract content from HTML using BeautifulSoup. + 2. Clean the extracted content using a content cleaning strategy. + 3. Filter the cleaned content using a content filtering strategy. + 4. Generate markdown content from the filtered content. + 5. Return the markdown content. 
+ """ + def __init__(self, logger=None): self.logger = logger @@ -72,17 +85,57 @@ class WebScrapingStrategy(ContentScrapingStrategy): log_method(message=message, tag=tag, **kwargs) def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + """ + Main entry point for content scraping. + + Args: + url (str): The URL of the page to scrape. + html (str): The HTML content of the page. + **kwargs: Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys: + + - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'. + - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'. + - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'. + - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown' + """ return self._scrap(url, html, is_async=False, **kwargs) async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: + """ + Main entry point for asynchronous content scraping. + + Args: + url (str): The URL of the page to scrape. + html (str): The HTML content of the page. + **kwargs: Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys: + + - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'. + - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'. 
+ - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'. + - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown' + """ return await asyncio.to_thread(self._scrap, url, html, **kwargs) - def _generate_markdown_content(self, - cleaned_html: str, - html: str, - url: str, - success: bool, - **kwargs) -> Dict[str, Any]: + def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]: + """ + Generate markdown content from cleaned HTML. + + Args: + cleaned_html (str): The cleaned HTML content. + html (str): The original HTML content. + url (str): The URL of the page. + success (bool): Whether the content was successfully cleaned. + **kwargs: Additional keyword arguments. + + Returns: + Dict[str, Any]: A dictionary containing the generated markdown content. + """ markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) if markdown_generator: @@ -156,6 +209,15 @@ class WebScrapingStrategy(ContentScrapingStrategy): """ def flatten_nested_elements(self, node): + """ + Flatten nested elements in a HTML tree. + + Args: + node (Tag): The root node of the HTML tree. + + Returns: + Tag: The flattened HTML tree. + """ if isinstance(node, NavigableString): return node if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name: @@ -164,6 +226,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): return node def find_closest_parent_with_useful_text(self, tag, **kwargs): + """ + Find the closest parent with useful text. + + Args: + tag (Tag): The starting tag to search from. + **kwargs: Additional keyword arguments. + + Returns: + Tag: The closest parent with useful text, or None if not found. 
+ """ image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) current_tag = tag while current_tag: @@ -177,6 +249,17 @@ class WebScrapingStrategy(ContentScrapingStrategy): return None def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False): + """ + Remove unwanted attributes from an HTML element. + + Args: + element (Tag): The HTML element to remove attributes from. + important_attrs (list): List of important attributes to keep. + keep_data_attributes (bool): Whether to keep data attributes. + + Returns: + None + """ attrs_to_remove = [] for attr in element.attrs: if attr not in important_attrs: @@ -190,6 +273,26 @@ class WebScrapingStrategy(ContentScrapingStrategy): del element[attr] def process_image(self, img, url, index, total_images, **kwargs): + """ + Process an image element. + + How it works: + 1. Check if the image has valid display and inside undesired html elements. + 2. Score an image for it's usefulness. + 3. Extract image file metadata to extract size and extension. + 4. Generate a dictionary with the processed image information. + 5. Return the processed image information. + + Args: + img (Tag): The image element to process. + url (str): The URL of the page containing the image. + index (int): The index of the image in the list of images. + total_images (int): The total number of images in the list. + **kwargs: Additional keyword arguments. + + Returns: + dict: A dictionary containing the processed image information. 
+ """ parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w') if ' ' in u else None} for u in [f"http{p}" for p in s.split("http") if p]] @@ -197,12 +300,15 @@ class WebScrapingStrategy(ContentScrapingStrategy): # Constants for checks classes_to_check = frozenset(['button', 'icon', 'logo']) tags_to_check = frozenset(['button', 'input']) + image_formats = frozenset(['jpg', 'jpeg', 'png', 'webp', 'avif', 'gif']) # Pre-fetch commonly used attributes style = img.get('style', '') alt = img.get('alt', '') src = img.get('src', '') data_src = img.get('data-src', '') + srcset = img.get('srcset', '') + data_srcset = img.get('data-srcset', '') width = img.get('width') height = img.get('height') parent = img.parent @@ -228,14 +334,36 @@ class WebScrapingStrategy(ContentScrapingStrategy): score += 1 score += index/total_images < 0.5 - image_format = '' - if "data:image/" in src: - image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0] - else: - image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0] + # image_format = '' + # if "data:image/" in src: + # image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0] + # else: + # image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0] - if image_format in ('jpg', 'png', 'webp', 'avif'): + # if image_format in ('jpg', 'png', 'webp', 'avif'): + # score += 1 + + + # Check for image format in all possible sources + def has_image_format(url): + return any(fmt in url.lower() for fmt in image_formats) + + # Score for having proper image sources + if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]): score += 1 + if srcset or data_srcset: + score += 1 + if img.find_parent('picture'): + score += 1 + + # Detect format from any available source + detected_format = None + for url in [src, data_src, srcset, data_srcset]: + if url: + format_matches = [fmt for fmt in image_formats if fmt in url.lower()] + if 
format_matches: + detected_format = format_matches[0] + break if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD): return None @@ -254,7 +382,8 @@ class WebScrapingStrategy(ContentScrapingStrategy): 'desc': self.find_closest_parent_with_useful_text(img, **kwargs), 'score': score, 'type': 'image', - 'group_id': group_id # Group ID for this set of variants + 'group_id': group_id, # Group ID for this set of variants + 'format': detected_format, } # Inline function for adding variants @@ -287,8 +416,24 @@ class WebScrapingStrategy(ContentScrapingStrategy): return image_variants if image_variants else None - def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]: + """ + Process an HTML element. + + How it works: + 1. Check if the element is an image, video, or audio. + 2. Extract the element's attributes and content. + 3. Process the element based on its type. + 4. Return the processed element information. + + Args: + url (str): The URL of the page containing the element. + element (Tag): The HTML element to process. + **kwargs: Additional keyword arguments. + + Returns: + dict: A dictionary containing the processed element information. + """ media = {'images': [], 'videos': [], 'audios': []} internal_links_dict = {} external_links_dict = {} @@ -307,6 +452,9 @@ class WebScrapingStrategy(ContentScrapingStrategy): } def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool: + """ + Process an HTML element. 
+ """ try: if isinstance(element, NavigableString): if isinstance(element, Comment): @@ -316,6 +464,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): # if element.name == 'img': # process_image(element, url, 0, 1) # return True + base_domain = kwargs.get("base_domain", get_base_domain(url)) if element.name in ['script', 'style', 'link', 'meta', 'noscript']: element.decompose() @@ -323,8 +472,10 @@ class WebScrapingStrategy(ContentScrapingStrategy): keep_element = False - exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', []) - exclude_social_media_domains = list(set(exclude_social_media_domains)) + exclude_domains = kwargs.get('exclude_domains', []) + # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS)) + # exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', []) + # exclude_social_media_domains = list(set(exclude_social_media_domains)) try: if element.name == 'a' and element.get('href'): @@ -344,33 +495,43 @@ class WebScrapingStrategy(ContentScrapingStrategy): link_data = { 'href': normalized_href, 'text': element.get_text().strip(), - 'title': element.get('title', '').strip() + 'title': element.get('title', '').strip(), + 'base_domain': base_domain } + + is_external = is_external_url(normalized_href, base_domain) + + keep_element = True - # Check for duplicates and add to appropriate dictionary - is_external = is_external_url(normalized_href, url_base) + # Handle external link exclusions + if is_external: + link_base_domain = get_base_domain(normalized_href) + link_data['base_domain'] = link_base_domain + if kwargs.get('exclude_external_links', False): + element.decompose() + return False + # elif kwargs.get('exclude_social_media_links', False): + # if link_base_domain in exclude_social_media_domains: + # element.decompose() + # return False + # if any(domain in normalized_href.lower() for domain in 
exclude_social_media_domains): + # element.decompose() + # return False + elif exclude_domains: + if link_base_domain in exclude_domains: + element.decompose() + return False + # if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])): + # element.decompose() + # return False + if is_external: if normalized_href not in external_links_dict: external_links_dict[normalized_href] = link_data else: if normalized_href not in internal_links_dict: internal_links_dict[normalized_href] = link_data - - keep_element = True - - # Handle external link exclusions - if is_external: - if kwargs.get('exclude_external_links', False): - element.decompose() - return False - elif kwargs.get('exclude_social_media_links', False): - if any(domain in normalized_href.lower() for domain in exclude_social_media_domains): - element.decompose() - return False - elif kwargs.get('exclude_domains', []): - if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])): - element.decompose() - return False + except Exception as e: raise Exception(f"Error processing links: {str(e)}") @@ -389,26 +550,40 @@ class WebScrapingStrategy(ContentScrapingStrategy): if 'srcset' in element.attrs: src = element.attrs['srcset'].split(',')[0].split(' ')[0] + # If image src is internal, then skip + if not is_external_url(src, base_domain): + return True + + image_src_base_domain = get_base_domain(src) + # Check flag if we should remove external images if kwargs.get('exclude_external_images', False): - src_url_base = src.split('/')[2] - url_base = url.split('/')[2] - if url_base not in src_url_base: - element.decompose() - return False + element.decompose() + return False + # src_url_base = src.split('/')[2] + # url_base = url.split('/')[2] + # if url_base not in src_url_base: + # element.decompose() + # return False - if not kwargs.get('exclude_external_images', False) and kwargs.get('exclude_social_media_links', False): - src_url_base = src.split('/')[2] 
- url_base = url.split('/')[2] - if any(domain in src for domain in exclude_social_media_domains): - element.decompose() - return False + # if kwargs.get('exclude_social_media_links', False): + # if image_src_base_domain in exclude_social_media_domains: + # element.decompose() + # return False + # src_url_base = src.split('/')[2] + # url_base = url.split('/')[2] + # if any(domain in src for domain in exclude_social_media_domains): + # element.decompose() + # return False # Handle exclude domains - if kwargs.get('exclude_domains', []): - if any(domain in src for domain in kwargs.get('exclude_domains', [])): + if exclude_domains: + if image_src_base_domain in exclude_domains: element.decompose() return False + # if any(domain in src for domain in kwargs.get('exclude_domains', [])): + # element.decompose() + # return False return True # Always keep image elements except Exception as e: @@ -480,12 +655,27 @@ class WebScrapingStrategy(ContentScrapingStrategy): return False def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: + """ + Extract content from HTML using BeautifulSoup. + + Args: + url (str): The URL of the page to scrape. + html (str): The HTML content of the page to scrape. + word_count_threshold (int): The minimum word count threshold for content extraction. + css_selector (str): The CSS selector to use for content extraction. + **kwargs: Additional keyword arguments. + + Returns: + dict: A dictionary containing the extracted content. 
+ """ success = True if not html: return None - soup = BeautifulSoup(html, 'lxml') + parser_type = kwargs.get('parser', 'lxml') + soup = BeautifulSoup(html, parser_type) body = soup.body + base_domain = get_base_domain(url) try: meta = extract_metadata("", soup) @@ -531,10 +721,16 @@ class WebScrapingStrategy(ContentScrapingStrategy): for el in selected_elements: body.append(el) + kwargs['exclude_social_media_domains'] = set(kwargs.get('exclude_social_media_domains', []) + SOCIAL_MEDIA_DOMAINS) + kwargs['exclude_domains'] = set(kwargs.get('exclude_domains', [])) + if kwargs.get('exclude_social_media_links', False): + kwargs['exclude_domains'] = kwargs['exclude_domains'].union(kwargs['exclude_social_media_domains']) + result_obj = self.process_element( url, body, word_count_threshold = word_count_threshold, + base_domain=base_domain, **kwargs ) diff --git a/crawl4ai/docs_manager.py b/crawl4ai/docs_manager.py new file mode 100644 index 00000000..aacc5812 --- /dev/null +++ b/crawl4ai/docs_manager.py @@ -0,0 +1,67 @@ +import requests +import shutil +from pathlib import Path +from crawl4ai.async_logger import AsyncLogger +from crawl4ai.llmtxt import AsyncLLMTextManager + +class DocsManager: + def __init__(self, logger=None): + self.docs_dir = Path.home() / ".crawl4ai" / "docs" + self.local_docs = Path(__file__).parent.parent / "docs" / "llm.txt" + self.docs_dir.mkdir(parents=True, exist_ok=True) + self.logger = logger or AsyncLogger(verbose=True) + self.llm_text = AsyncLLMTextManager(self.docs_dir, self.logger) + + async def ensure_docs_exist(self): + """Fetch docs if not present""" + if not any(self.docs_dir.iterdir()): + await self.fetch_docs() + + async def fetch_docs(self) -> bool: + """Copy from local docs or download from GitHub""" + try: + # Try local first + if self.local_docs.exists() and (any(self.local_docs.glob("*.md")) or any(self.local_docs.glob("*.tokens"))): + # Empty the local docs directory + for file_path in self.docs_dir.glob("*.md"): + 
file_path.unlink() + # for file_path in self.docs_dir.glob("*.tokens"): + # file_path.unlink() + for file_path in self.local_docs.glob("*.md"): + shutil.copy2(file_path, self.docs_dir / file_path.name) + # for file_path in self.local_docs.glob("*.tokens"): + # shutil.copy2(file_path, self.docs_dir / file_path.name) + return True + + # Fallback to GitHub + response = requests.get( + "https://api.github.com/repos/unclecode/crawl4ai/contents/docs/llm.txt", + headers={'Accept': 'application/vnd.github.v3+json'} + ) + response.raise_for_status() + + for item in response.json(): + if item['type'] == 'file' and item['name'].endswith('.md'): + content = requests.get(item['download_url']).text + with open(self.docs_dir / item['name'], 'w', encoding='utf-8') as f: + f.write(content) + return True + + except Exception as e: + self.logger.error(f"Failed to fetch docs: {str(e)}") + raise + + def list(self) -> list[str]: + """List available topics""" + names = [file_path.stem for file_path in self.docs_dir.glob("*.md")] + # Remove [0-9]+_ prefix + names = [name.split("_", 1)[1] if name[0].isdigit() else name for name in names] + # Exclude those end with .xs.md and .q.md + names = [name for name in names if not name.endswith(".xs") and not name.endswith(".q")] + return names + + def generate(self, sections, mode="extended"): + return self.llm_text.generate(sections, mode) + + def search(self, query: str, top_k: int = 5): + return self.llm_text.search(query, top_k) \ No newline at end of file diff --git a/crawl4ai/extraction_strategy.bak.py b/crawl4ai/extraction_strategy.bak.py new file mode 100644 index 00000000..2048c0ff --- /dev/null +++ b/crawl4ai/extraction_strategy.bak.py @@ -0,0 +1,1440 @@ +from abc import ABC, abstractmethod +from typing import Any, List, Dict, Optional, Union +from concurrent.futures import ThreadPoolExecutor, as_completed +import json, time +# from optimum.intel import IPEXModel +from .prompts import * +from .config import * +from .utils import * +from 
.models import * +from functools import partial +from .model_loader import * +import math +import numpy as np +import re +from bs4 import BeautifulSoup +from lxml import html, etree +from dataclasses import dataclass + +class ExtractionStrategy(ABC): + """ + Abstract base class for all extraction strategies. + """ + + def __init__(self, input_format: str = "markdown", **kwargs): + """ + Initialize the extraction strategy. + + Args: + input_format: Content format to use for extraction. + Options: "markdown" (default), "html", "fit_markdown" + **kwargs: Additional keyword arguments + """ + self.input_format = input_format + self.DEL = "<|DEL|>" + self.name = self.__class__.__name__ + self.verbose = kwargs.get("verbose", False) + + @abstractmethod + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. + + :param url: The URL of the webpage. + :param html: The HTML content of the webpage. + :return: A list of extracted blocks or chunks. + """ + pass + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Process sections of text in parallel by default. + + :param url: The URL of the webpage. + :param sections: List of sections (strings) to process. + :return: A list of processed JSON blocks. + """ + extracted_content = [] + with ThreadPoolExecutor() as executor: + futures = [executor.submit(self.extract, url, section, **kwargs) for section in sections] + for future in as_completed(futures): + extracted_content.extend(future.result()) + return extracted_content + +class NoExtractionStrategy(ExtractionStrategy): + """ + A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block. + """ + def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. 
+ """ + return [{"index": 0, "content": html}] + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)] + +####################################################### +# Strategies using LLM-based extraction for text data # +####################################################### +class LLMExtractionStrategy(ExtractionStrategy): + """ + A strategy that uses an LLM to extract meaningful content from the HTML. + + Attributes: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. + """ + + def __init__(self, + provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, + instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): + """ + Initialize the strategy with clustering parameters. + + Args: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. 
+ word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. + + """ + super().__init__(**kwargs) + self.provider = provider + self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY") + self.instruction = instruction + self.extract_type = extraction_type + self.schema = schema + if schema: + self.extract_type = "schema" + + self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD) + self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) + self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) + self.apply_chunking = kwargs.get("apply_chunking", True) + self.base_url = kwargs.get("base_url", None) + self.api_base = kwargs.get("api_base", kwargs.get("base_url", None)) + self.extra_args = kwargs.get("extra_args", {}) + if not self.apply_chunking: + self.chunk_token_threshold = 1e9 + + self.verbose = kwargs.get("verbose", False) + self.usages = [] # Store individual usages + self.total_usage = TokenUsage() # Accumulated usage + + if not self.api_token: + raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.") + + + def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML using an LLM. + + How it works: + 1. Construct a prompt with variables. + 2. Make a request to the LLM using the prompt. + 3. Parse the response and extract blocks or chunks. + + Args: + url: The URL of the webpage. + ix: Index of the block. + html: The HTML content of the webpage. 
+ + Returns: + A list of extracted blocks or chunks. + """ + if self.verbose: + # print("[LOG] Extracting blocks from URL:", url) + print(f"[LOG] Call LLM for {url} - block index: {ix}") + + variable_values = { + "URL": url, + "HTML": escape_json_string(sanitize_html(html)), + } + + prompt_with_variables = PROMPT_EXTRACT_BLOCKS + if self.instruction: + variable_values["REQUEST"] = self.instruction + prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION + + if self.extract_type == "schema" and self.schema: + variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) + prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION + + for variable in variable_values: + prompt_with_variables = prompt_with_variables.replace( + "{" + variable + "}", variable_values[variable] + ) + + response = perform_completion_with_backoff( + self.provider, + prompt_with_variables, + self.api_token, + base_url=self.api_base or self.base_url, + extra_args = self.extra_args + ) # , json_response=self.extract_type == "schema") + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {}, + prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {} + ) + self.usages.append(usage) + + # Update totals + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + + try: + blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] + blocks = json.loads(blocks) + for block in blocks: + block['error'] = False + except Exception as e: + parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) + blocks = parsed + if 
unparsed: + blocks.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": unparsed + }) + + if self.verbose: + print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix) + return blocks + + def _merge(self, documents, chunk_token_threshold, overlap): + """ + Merge documents into sections based on chunk_token_threshold and overlap. + """ + chunks = [] + sections = [] + total_tokens = 0 + + # Calculate the total tokens across all documents + for document in documents: + total_tokens += len(document.split(' ')) * self.word_token_rate + + # Calculate the number of sections needed + num_sections = math.floor(total_tokens / chunk_token_threshold) + if num_sections < 1: + num_sections = 1 # Ensure there is at least one section + adjusted_chunk_threshold = total_tokens / num_sections + + total_token_so_far = 0 + current_chunk = [] + + for document in documents: + tokens = document.split(' ') + token_count = len(tokens) * self.word_token_rate + + if total_token_so_far + token_count <= adjusted_chunk_threshold: + current_chunk.extend(tokens) + total_token_so_far += token_count + else: + # Ensure to handle the last section properly + if len(sections) == num_sections - 1: + current_chunk.extend(tokens) + continue + + # Add overlap if specified + if overlap > 0 and current_chunk: + overlap_tokens = current_chunk[-overlap:] + current_chunk.extend(overlap_tokens) + + sections.append(' '.join(current_chunk)) + current_chunk = tokens + total_token_so_far = token_count + + # Add the last chunk + if current_chunk: + sections.append(' '.join(current_chunk)) + + return sections + + + def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: + """ + Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. + + Args: + url: The URL of the webpage. + sections: List of sections (strings) to process. + + Returns: + A list of extracted blocks or chunks. 
def show_usage(self) -> None:
    """
    Print a token usage report to stdout.

    The report has two tables: the accumulated totals held in
    ``self.total_usage`` and one row per entry of ``self.usages``,
    numbered from 1 in recording order.
    """
    totals = self.total_usage
    summary_rows = (
        ("Completion", totals.completion_tokens),
        ("Prompt", totals.prompt_tokens),
        ("Total", totals.total_tokens),
    )

    print("\n=== Token Usage Summary ===")
    print(f"{'Type':<15} {'Count':>12}")
    print("-" * 30)
    for label, amount in summary_rows:
        print(f"{label:<15} {amount:>12,}")

    print("\n=== Usage History ===")
    print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
    print("-" * 48)
    request_no = 0
    for record in self.usages:
        request_no += 1
        print(f"{request_no:<10} {record.completion_tokens:>12,} {record.prompt_tokens:>12,} {record.total_tokens:>12,}")
+####################################################### +# Strategies using clustering for text data extraction # +####################################################### + +class CosineStrategy(ExtractionStrategy): + """ + Extract meaningful blocks or chunks from the given HTML using cosine similarity. + + How it works: + 1. Pre-filter documents using embeddings and semantic_filter. + 2. Perform clustering using cosine similarity. + 3. Organize texts by their cluster labels, retaining order. + 4. Filter clusters by word count. + 5. Extract meaningful blocks or chunks from the filtered clusters. + + Attributes: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. + model_name (str): The name of the sentence-transformers model. + sim_threshold (float): The similarity threshold for clustering. + """ + def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs): + """ + Initialize the strategy with clustering parameters. + + Args: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. 
+ """ + super().__init__(**kwargs) + + import numpy as np + + self.semantic_filter = semantic_filter + self.word_count_threshold = word_count_threshold + self.max_dist = max_dist + self.linkage_method = linkage_method + self.top_k = top_k + self.sim_threshold = sim_threshold + self.timer = time.time() + self.verbose = kwargs.get("verbose", False) + + self.buffer_embeddings = np.array([]) + self.get_embedding_method = "direct" + + self.device = get_device() + # import torch + # self.device = torch.device('cpu') + + self.default_batch_size = calculate_batch_size(self.device) + + if self.verbose: + print(f"[LOG] Loading Extraction Model for {self.device.type} device.") + + # if False and self.device.type == "cpu": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + # else: + + self.tokenizer, self.model = load_HF_embedding_model(model_name) + self.model.to(self.device) + self.model.eval() + + self.get_embedding_method = "batch" + + self.buffer_embeddings = np.array([]) + + # if model_name == "bert-base-uncased": + # self.tokenizer, self.model = load_bert_base_uncased() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "BAAI/bge-small-en-v1.5": + # self.tokenizer, self.model = load_bge_small_en_v1_5() + # self.model.eval() # Ensure the model is in evaluation mode + # self.get_embedding_method = "batch" + # elif model_name == "sentence-transformers/all-MiniLM-L6-v2": + # self.model = load_onnx_all_MiniLM_l6_v2() + # self.tokenizer = self.model.tokenizer + # self.get_embedding_method = "direct" + + + if self.verbose: + print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.") + + self.nlp, _ = load_text_multilabel_classifier() + # self.default_batch_size = 16 if self.device.type == 'cpu' else 64 + + if self.verbose: + print(f"[LOG] Model loaded {model_name}, models/reuters, took " + str(time.time() - 
def filter_documents_embeddings(self, documents: List[str], semantic_filter: str, at_least_k: int = 20) -> List[str]:
    """
    Keep the documents that are semantically close to ``semantic_filter``.

    Every document whose cosine similarity to the filter embedding is at
    least ``self.sim_threshold`` is kept, in its original order. If fewer
    than ``at_least_k`` documents pass, the best-scoring remaining documents
    are appended (most similar first) until ``at_least_k`` are returned.
    When there are fewer than ``at_least_k`` documents in total, the minimum
    is lowered to half the number of documents.

    Args:
        documents (List[str]): A list of document texts.
        semantic_filter (str): A keyword filter for document filtering.
            A falsy value disables filtering.
        at_least_k (int): The minimum number of documents to return.

    Returns:
        List[str]: The filtered document texts. ``documents`` is returned
        unchanged when no filter is given or there is nothing to filter.
    """
    if not semantic_filter or not documents:
        return documents

    # numpy is imported locally, mirroring how this class imports it elsewhere.
    import numpy as np

    if len(documents) < at_least_k:
        at_least_k = len(documents) // 2

    query_embedding = np.asarray(self.get_embeddings([semantic_filter])[0], dtype=float)
    document_embeddings = np.asarray(self.get_embeddings(documents), dtype=float)

    # Cosine similarity; zero-norm vectors get similarity 0 instead of NaN.
    dots = document_embeddings @ query_embedding
    norms = np.linalg.norm(document_embeddings, axis=1) * np.linalg.norm(query_embedding)
    similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    scored = list(zip(documents, similarities))
    kept = [doc for doc, sim in scored if sim >= self.sim_threshold]

    # Top up with the closest misses. The result is deliberately NOT
    # truncated: ``at_least_k`` is a minimum, and truncating used to drop
    # matching documents (and always dropped a lone document).
    if len(kept) < at_least_k:
        misses = [(doc, sim) for doc, sim in scored if sim < self.sim_threshold]
        misses.sort(key=lambda pair: pair[1], reverse=True)
        kept.extend(doc for doc, _ in misses[:at_least_k - len(kept)])

    return kept
def hierarchical_clustering(self, sentences: List[str], embeddings = None):
    """
    Perform hierarchical clustering on sentences and return cluster labels.

    Pairwise cosine distances between the sentence embeddings are linked
    with ``self.linkage_method`` and cut at ``self.max_dist`` to form flat
    clusters.

    Args:
        sentences (List[str]): A list of text chunks (sentences).
        embeddings: Optional precomputed embeddings, one row per sentence.
            When omitted they are computed with ``self.get_embeddings``.

    Returns:
        NumPy array of cluster labels, one per sentence. With fewer than two
        sentences there is nothing to link, so every sentence (if any) gets
        label 1.
    """
    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import pdist

    self.timer = time.time()
    if embeddings is None:
        embeddings = self.get_embeddings(sentences, bypass_buffer=True)
    embeddings = np.asarray(embeddings)

    # linkage() raises on an empty distance matrix, which is what pdist()
    # yields for a single observation.
    if len(embeddings) < 2:
        return np.ones(len(embeddings), dtype=np.int32)

    # Compute pairwise cosine distances
    distance_matrix = pdist(embeddings, 'cosine')
    # Build the linkage tree
    linked = linkage(distance_matrix, method=self.linkage_method)
    # Form flat clusters
    return fcluster(linked, self.max_dist, criterion='distance')
+ """ + # Assume `html` is a list of text chunks for this strategy + t = time.time() + text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed + + # Pre-filter documents using embeddings and semantic_filter + text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter) + + if not text_chunks: + return [] + + # Perform clustering + labels = self.hierarchical_clustering(text_chunks) + # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds") + + # Organize texts by their cluster labels, retaining order + t = time.time() + clusters = {} + for index, label in enumerate(labels): + clusters.setdefault(label, []).append(text_chunks[index]) + + # Filter clusters by word count + filtered_clusters = self.filter_clusters_by_word_count(clusters) + + # Convert filtered clusters to a sorted list of dictionaries + cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)] + + if self.verbose: + print(f"[LOG] 🚀 Assign tags using {self.device}") + + if self.device.type in ["gpu", "cuda", "mps", "cpu"]: + labels = self.nlp([cluster['content'] for cluster in cluster_list]) + + for cluster, label in zip(cluster_list, labels): + cluster['tags'] = label + # elif self.device.type == "cpu": + # # Process the text with the loaded model + # texts = [cluster['content'] for cluster in cluster_list] + # # Batch process texts + # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) + + # for doc, cluster in zip(docs, cluster_list): + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + # for cluster in cluster_list: + # doc = self.nlp(cluster['content']) + # tok_k = self.top_k + # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] + # cluster['tags'] = [cat for cat, _ in top_categories] + + if 
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Cluster all sections together in a single extraction pass.

    Args:
        url (str): The URL of the webpage.
        sections (List[str]): List of sections (strings) to process.
        *q: Extra positional arguments; accepted but not used.
        **kwargs: Forwarded unchanged to ``extract``.

    Returns:
        List[Dict[str, Any]]: Whatever ``extract`` returns for the sections
        joined with ``self.DEL``.
    """
    # The strategy works on the whole page at once, so glue the sections
    # back together before delegating.
    joined_text = self.DEL.join(sections)
    return self.extract(url, joined_text, **kwargs)
def __init__(self, schema: Dict[str, Any], **kwargs):
    """
    Set up the JSON element extraction strategy.

    Args:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        **kwargs: Passed through to the parent initializer; the ``verbose``
            entry (default ``False``) is also stored on the instance.
    """
    super().__init__(**kwargs)
    verbose_flag = kwargs.get('verbose', False)
    self.schema = schema
    self.verbose = verbose_flag
+ """ + + parsed_html = self._parse_html(html_content) + base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector']) + + results = [] + for element in base_elements: + # Extract base element attributes + item = {} + if 'baseFields' in self.schema: + for field in self.schema['baseFields']: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + + # Extract child fields + field_data = self._extract_item(element, self.schema['fields']) + item.update(field_data) + + if item: + results.append(item) + + return results + + @abstractmethod + def _parse_html(self, html_content: str): + """Parse HTML content into appropriate format""" + pass + + @abstractmethod + def _get_base_elements(self, parsed_html, selector: str): + """Get all base elements using the selector""" + pass + + @abstractmethod + def _get_elements(self, element, selector: str): + """Get child elements using the selector""" + pass + + def _extract_field(self, element, field): + try: + if field['type'] == 'nested': + nested_elements = self._get_elements(element, field['selector']) + nested_element = nested_elements[0] if nested_elements else None + return self._extract_item(nested_element, field['fields']) if nested_element else {} + + if field['type'] == 'list': + elements = self._get_elements(element, field['selector']) + return [self._extract_list_item(el, field['fields']) for el in elements] + + if field['type'] == 'nested_list': + elements = self._get_elements(element, field['selector']) + return [self._extract_item(el, field['fields']) for el in elements] + + return self._extract_single_field(element, field) + except Exception as e: + if self.verbose: + print(f"Error extracting field {field['name']}: {str(e)}") + return field.get('default') + + def _extract_single_field(self, element, field): + """ + Extract a single field based on its type. + + How it works: + 1. Selects the target element using the field's selector. + 2. 
Extracts the field value based on its type (e.g., text, attribute, regex). + 3. Applies transformations if defined in the schema. + + Args: + element: The base element to extract the field from. + field (Dict[str, Any]): The field definition in the schema. + + Returns: + Any: The extracted field value. + """ + + if 'selector' in field: + selected = self._get_elements(element, field['selector']) + if not selected: + return field.get('default') + selected = selected[0] + else: + selected = element + + value = None + if field['type'] == 'text': + value = self._get_element_text(selected) + elif field['type'] == 'attribute': + value = self._get_element_attribute(selected, field['attribute']) + elif field['type'] == 'html': + value = self._get_element_html(selected) + elif field['type'] == 'regex': + text = self._get_element_text(selected) + match = re.search(field['pattern'], text) + value = match.group(1) if match else None + + if 'transform' in field: + value = self._apply_transform(value, field['transform']) + + return value if value is not None else field.get('default') + + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _extract_item(self, element, fields): + """ + Extracts fields from a given element. + + How it works: + 1. Iterates through the fields defined in the schema. + 2. Handles computed, single, and nested field types. + 3. Updates the item dictionary with extracted field values. + + Args: + element: The base element to extract fields from. + fields (List[Dict[str, Any]]): The list of fields to extract. + + Returns: + Dict[str, Any]: A dictionary representing the extracted item. 
+ """ + + item = {} + for field in fields: + if field['type'] == 'computed': + value = self._compute_field(item, field) + else: + value = self._extract_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _apply_transform(self, value, transform): + """ + Apply a transformation to a value. + + How it works: + 1. Checks the transformation type (e.g., `lowercase`, `strip`). + 2. Applies the transformation to the value. + 3. Returns the transformed value. + + Args: + value (str): The value to transform. + transform (str): The type of transformation to apply. + + Returns: + str: The transformed value. + """ + + if transform == 'lowercase': + return value.lower() + elif transform == 'uppercase': + return value.upper() + elif transform == 'strip': + return value.strip() + return value + + def _compute_field(self, item, field): + try: + if 'expression' in field: + return eval(field['expression'], {}, item) + elif 'function' in field: + return field['function'](item) + except Exception as e: + if self.verbose: + print(f"Error computing field {field['name']}: {str(e)}") + return field.get('default') + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Run the extraction strategy on a combined HTML content. + + How it works: + 1. Combines multiple HTML sections using the `DEL` delimiter. + 2. Calls the `extract` method with the combined HTML. + + Args: + url (str): The URL of the page being processed. + sections (List[str]): A list of HTML sections. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items. 
+ """ + + combined_html = self.DEL.join(sections) + return self.extract(url, combined_html, **kwargs) + + @abstractmethod + def _get_element_text(self, element) -> str: + """Get text content from element""" + pass + + @abstractmethod + def _get_element_html(self, element) -> str: + """Get HTML content from element""" + pass + + @abstractmethod + def _get_element_attribute(self, element, attribute: str): + """Get attribute value from element""" + pass + +class JsonCssExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors. + + How it works: + 1. Parses HTML content with BeautifulSoup. + 2. Selects elements using CSS selectors defined in the schema. + 3. Extracts field data and applies transformations as defined. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into a BeautifulSoup object. + _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector. + _get_elements(element, selector): Selects child elements using a CSS selector. + _get_element_text(element): Extracts text content from a BeautifulSoup element. + _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element. + _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element. 
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return BeautifulSoup(html_content, 'html.parser') + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.select(selector) + + def _get_elements(self, element, selector: str): + selected = element.select_one(selector) + return [selected] if selected else [] + + def _get_element_text(self, element) -> str: + return element.get_text(strip=True) + + def _get_element_html(self, element) -> str: + return str(element) + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + +class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors. + + How it works: + 1. Parses HTML content into an lxml tree. + 2. Selects elements using XPath expressions. + 3. Converts CSS selectors to XPath when needed. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into an lxml tree. + _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector. + _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression. + _get_elements(element, selector): Selects child elements using an XPath selector. + _get_element_text(element): Extracts text content from an lxml element. + _get_element_html(element): Extracts the raw HTML content of an lxml element. + _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element. 
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return html.fromstring(html_content) + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.xpath(selector) + + def _css_to_xpath(self, css_selector: str) -> str: + """Convert CSS selector to XPath if needed""" + if '/' in css_selector: # Already an XPath + return css_selector + return self._basic_css_to_xpath(css_selector) + + def _basic_css_to_xpath(self, css_selector: str) -> str: + """Basic CSS to XPath conversion for common cases""" + if ' > ' in css_selector: + parts = css_selector.split(' > ') + return '//' + '/'.join(parts) + if ' ' in css_selector: + parts = css_selector.split(' ') + return '//' + '//'.join(parts) + return '//' + css_selector + + def _get_elements(self, element, selector: str): + xpath = self._css_to_xpath(selector) + if not xpath.startswith('.'): + xpath = '.' + xpath + return element.xpath(xpath) + + def _get_element_text(self, element) -> str: + return ''.join(element.xpath('.//text()')).strip() + + def _get_element_html(self, element) -> str: + return etree.tostring(element, encoding='unicode') + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + + +####################################################### +# Strategies based on the extraction of specific types# +####################################################### + +class TopicExtractionStrategy(ExtractionStrategy): + def __init__(self, num_keywords: int = 3, **kwargs): + """ + Initialize the topic extraction strategy with parameters for topic segmentation. + + :param num_keywords: Number of keywords to represent each topic segment. 
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Split the content into segments and attach keywords to each one.

    The text is split on ``self.DEL``; every resulting segment is passed to
    ``self.extract_keywords``.

    :param url: The URL of the webpage (not used here).
    :param html: The content of the webpage, with segments separated by ``self.DEL``.
    :param q: Extra positional arguments (not used here).
    :param kwargs: Extra keyword arguments (not used here).
    :return: One dictionary per segment with its ``index``, ``content`` and ``keywords``.
    """
    return [
        {
            "index": position,
            "content": piece,
            "keywords": self.extract_keywords(piece),
        }
        for position, piece in enumerate(html.split(self.DEL))
    ]
def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> Dict[str, Any]:
    """
    Summarize a single section of text.

    :param url: The URL of the webpage (not used here).
    :param text: A section of text to summarize.
    :param provider: The provider to be used for extraction (not used here).
    :param api_token: Optional API token for the provider (not used here).
    :return: A single dictionary ``{"summary": ...}``. If the summarizer
        raises, the error is printed and the original text is returned as
        the summary.
    """
    try:
        summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
        return {"summary": summary[0]['summary_text']}
    except Exception as e:
        # Best-effort: one failing section should not abort the whole run.
        print(f"Error summarizing text: {e}")
        return {"summary": text}  # Fallback to original text if summarization fails
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Extract one item per element matching the schema's base selector.

    The HTML is parsed with BeautifulSoup. For each base element the
    optional ``baseFields`` are read from the element itself, then the
    schema's ``fields`` are extracted and merged on top. Items are appended
    even when they turn out empty.

    Args:
        url (str): The URL of the page (not used here).
        html (str): The raw HTML to parse.
        *q: Extra positional arguments (not used here).
        **kwargs: Extra keyword arguments (not used here).

    Returns:
        List[Dict[str, Any]]: One dictionary per base element.
    """
    document = BeautifulSoup(html, 'html.parser')
    extracted = []

    for node in document.select(self.schema['baseSelector']):
        record = {}

        # Attributes taken from the base element itself come first.
        if 'baseFields' in self.schema:
            for base_field in self.schema['baseFields']:
                base_value = self._extract_single_field(node, base_field)
                if base_value is None:
                    continue
                record[base_field['name']] = base_value

        # Child fields override base fields that share a name.
        record.update(self._extract_item(node, self.schema['fields']))
        extracted.append(record)

    return extracted
self._extract_item(nested_element, field['fields']) if nested_element else {} + + if field['type'] == 'list': + elements = element.select(field['selector']) + return [self._extract_list_item(el, field['fields']) for el in elements] + + if field['type'] == 'nested_list': + elements = element.select(field['selector']) + return [self._extract_item(el, field['fields']) for el in elements] + + return self._extract_single_field(element, field) + except Exception as e: + if self.verbose: + print(f"Error extracting field {field['name']}: {str(e)}") + return field.get('default') + + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _extract_single_field(self, element, field): + if 'selector' in field: + selected = element.select_one(field['selector']) + if not selected: + return field.get('default') + else: + selected = element + + value = None + if field['type'] == 'text': + value = selected.get_text(strip=True) + elif field['type'] == 'attribute': + value = selected.get(field['attribute']) + elif field['type'] == 'html': + value = str(selected) + elif field['type'] == 'regex': + text = selected.get_text(strip=True) + match = re.search(field['pattern'], text) + value = match.group(1) if match else None + + if 'transform' in field: + value = self._apply_transform(value, field['transform']) + + return value if value is not None else field.get('default') + + def _extract_item(self, element, fields): + item = {} + for field in fields: + if field['type'] == 'computed': + value = self._compute_field(item, field) + else: + value = self._extract_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _apply_transform(self, value, transform): + if transform == 'lowercase': + return value.lower() + elif transform == 'uppercase': + return value.upper() + elif transform == 'strip': + 
return value.strip() + return value + + def _compute_field(self, item, field): + try: + if 'expression' in field: + return eval(field['expression'], {}, item) + elif 'function' in field: + return field['function'](item) + except Exception as e: + if self.verbose: + print(f"Error computing field {field['name']}: {str(e)}") + return field.get('default') + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + combined_html = self.DEL.join(sections) + return self.extract(url, combined_html, **kwargs) +class _JsonXPathExtractionStrategy(ExtractionStrategy): + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(**kwargs) + self.schema = schema + + def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]: + tree = html.fromstring(html_content) + base_xpath = self.schema['baseSelector'] + base_elements = tree.xpath(base_xpath) + + results = [] + for element in base_elements: + # Extract base element attributes first + item = {} + if 'baseFields' in self.schema: + for field in self.schema['baseFields']: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + + # Then extract child fields + field_data = self._extract_item(element, self.schema['fields']) + item.update(field_data) + + results.append(item) + + return results + + def _css_to_xpath(self, css_selector: str) -> str: + """Convert CSS selector to XPath if needed""" + if '/' in css_selector: # Already an XPath + return css_selector + else: + # Fallback to basic conversion for common cases + return self._basic_css_to_xpath(css_selector) + + def _basic_css_to_xpath(self, css_selector: str) -> str: + """Basic CSS to XPath conversion for common cases""" + # Handle basic cases + if ' > ' in css_selector: + parts = css_selector.split(' > ') + return '//' + '/'.join(parts) + if ' ' in css_selector: + parts = css_selector.split(' ') + 
return '//' + '//'.join(parts) + return '//' + css_selector + + def _extract_field(self, element, field): + try: + if field['type'] == 'nested': + xpath = self._css_to_xpath(field['selector']) + nested_element = element.xpath(xpath)[0] if element.xpath(xpath) else None + return self._extract_item(nested_element, field['fields']) if nested_element is not None else {} + + if field['type'] == 'list': + xpath = self._css_to_xpath(field['selector']) + elements = element.xpath(xpath) + return [self._extract_list_item(el, field['fields']) for el in elements] + + if field['type'] == 'nested_list': + xpath = self._css_to_xpath(field['selector']) + elements = element.xpath(xpath) + return [self._extract_item(el, field['fields']) for el in elements] + + return self._extract_single_field(element, field) + except Exception as e: + if self.verbose: + print(f"Error extracting field {field['name']}: {str(e)}") + return field.get('default') + + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _extract_single_field(self, element, field): + if 'selector' in field: + xpath = self._css_to_xpath(field['selector']) + selected = element.xpath(xpath) + if not selected: + return field.get('default') + selected = selected[0] + else: + selected = element + + value = None + if field['type'] == 'text': + value = ''.join(selected.xpath('.//text()')).strip() + elif field['type'] == 'attribute': + value = selected.get(field['attribute']) + elif field['type'] == 'html': + value = etree.tostring(selected, encoding='unicode') + elif field['type'] == 'regex': + text = ''.join(selected.xpath('.//text()')).strip() + match = re.search(field['pattern'], text) + value = match.group(1) if match else None + + if 'transform' in field: + value = self._apply_transform(value, field['transform']) + + return value if value is not None else 
field.get('default') + + def _extract_item(self, element, fields): + item = {} + for field in fields: + if field['type'] == 'computed': + value = self._compute_field(item, field) + else: + value = self._extract_field(element, field) + if value is not None: + item[field['name']] = value + return item + + def _apply_transform(self, value, transform): + if transform == 'lowercase': + return value.lower() + elif transform == 'uppercase': + return value.upper() + elif transform == 'strip': + return value.strip() + return value + + def _compute_field(self, item, field): + try: + if 'expression' in field: + return eval(field['expression'], {}, item) + elif 'function' in field: + return field['function'](item) + except Exception as e: + if self.verbose: + print(f"Error computing field {field['name']}: {str(e)}") + return field.get('default') + + def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + combined_html = self.DEL.join(sections) + return self.extract(url, combined_html, **kwargs) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 50e5da36..7441e32d 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -6,18 +6,31 @@ import json, time from .prompts import * from .config import * from .utils import * +from .models import * from functools import partial from .model_loader import * import math import numpy as np -from lxml import etree +import re +from bs4 import BeautifulSoup +from lxml import html, etree +from dataclasses import dataclass class ExtractionStrategy(ABC): """ Abstract base class for all extraction strategies. """ - def __init__(self, **kwargs): + def __init__(self, input_format: str = "markdown", **kwargs): + """ + Initialize the extraction strategy. + + Args: + input_format: Content format to use for extraction. 
+ Options: "markdown" (default), "html", "fit_markdown" + **kwargs: Additional keyword arguments + """ + self.input_format = input_format self.DEL = "<|DEL|>" self.name = self.__class__.__name__ self.verbose = kwargs.get("verbose", False) @@ -49,24 +62,68 @@ class ExtractionStrategy(ABC): return extracted_content class NoExtractionStrategy(ExtractionStrategy): + """ + A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block. + """ def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML. + """ return [{"index": 0, "content": html}] def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)] - + +####################################################### +# Strategies using LLM-based extraction for text data # +####################################################### class LLMExtractionStrategy(ExtractionStrategy): + """ + A strategy that uses an LLM to extract meaningful content from the HTML. + + Attributes: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. 
+ """ + def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): """ Initialize the strategy with clustering parameters. + + Args: + provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". + api_token: The API token for the provider. + instruction: The instruction to use for the LLM model. + schema: Pydantic model schema for structured data. + extraction_type: "block" or "schema". + chunk_token_threshold: Maximum tokens per chunk. + overlap_rate: Overlap between chunks. + word_token_rate: Word to token conversion rate. + apply_chunking: Whether to apply chunking. + base_url: The base URL for the API request. + api_base: The base URL for the API request. + extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + verbose: Whether to print verbose output. + usages: List of individual token usages. + total_usage: Accumulated token usage. - :param provider: The provider to use for extraction. - :param api_token: The API token for the provider. - :param instruction: The instruction to use for the LLM model. """ - super().__init__() + super().__init__(**kwargs) self.provider = provider self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY") self.instruction = instruction @@ -86,12 +143,30 @@ class LLMExtractionStrategy(ExtractionStrategy): self.chunk_token_threshold = 1e9 self.verbose = kwargs.get("verbose", False) + self.usages = [] # Store individual usages + self.total_usage = TokenUsage() # Accumulated usage if not self.api_token: raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.") def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: + """ + Extract meaningful blocks or chunks from the given HTML using an LLM. + + How it works: + 1. 
Construct a prompt with variables. + 2. Make a request to the LLM using the prompt. + 3. Parse the response and extract blocks or chunks. + + Args: + url: The URL of the webpage. + ix: Index of the block. + html: The HTML content of the webpage. + + Returns: + A list of extracted blocks or chunks. + """ if self.verbose: # print("[LOG] Extracting blocks from URL:", url) print(f"[LOG] Call LLM for {url} - block index: {ix}") @@ -122,6 +197,21 @@ class LLMExtractionStrategy(ExtractionStrategy): base_url=self.api_base or self.base_url, extra_args = self.extra_args ) # , json_response=self.extract_type == "schema") + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=response.usage.completion_tokens_details.__dict__ if response.usage.completion_tokens_details else {}, + prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ if response.usage.prompt_tokens_details else {} + ) + self.usages.append(usage) + + # Update totals + self.total_usage.completion_tokens += usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + try: blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] blocks = json.loads(blocks) @@ -143,6 +233,9 @@ class LLMExtractionStrategy(ExtractionStrategy): return blocks def _merge(self, documents, chunk_token_threshold, overlap): + """ + Merge documents into sections based on chunk_token_threshold and overlap. + """ chunks = [] sections = [] total_tokens = 0 @@ -192,6 +285,13 @@ class LLMExtractionStrategy(ExtractionStrategy): def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: """ Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. + + Args: + url: The URL of the webpage. 
+ sections: List of sections (strings) to process. + + Returns: + A list of extracted blocks or chunks. """ merged_sections = self._merge( @@ -231,8 +331,47 @@ class LLMExtractionStrategy(ExtractionStrategy): return extracted_content + + + def show_usage(self) -> None: + """Print a detailed token usage report showing total and per-request usage.""" + print("\n=== Token Usage Summary ===") + print(f"{'Type':<15} {'Count':>12}") + print("-" * 30) + print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}") + print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}") + print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}") + + print("\n=== Usage History ===") + print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}") + print("-" * 48) + for i, usage in enumerate(self.usages, 1): + print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}") +####################################################### +# Strategies using clustering for text data extraction # +####################################################### + class CosineStrategy(ExtractionStrategy): + """ + Extract meaningful blocks or chunks from the given HTML using cosine similarity. + + How it works: + 1. Pre-filter documents using embeddings and semantic_filter. + 2. Perform clustering using cosine similarity. + 3. Organize texts by their cluster labels, retaining order. + 4. Filter clusters by word count. + 5. Extract meaningful blocks or chunks from the filtered clusters. + + Attributes: + semantic_filter (str): A keyword filter for document filtering. + word_count_threshold (int): Minimum number of words per cluster. + max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. + linkage_method (str): The linkage method for hierarchical clustering. + top_k (int): Number of top categories to extract. + model_name (str): The name of the sentence-transformers model. 
+ sim_threshold (float): The similarity threshold for clustering. + """ def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs): """ Initialize the strategy with clustering parameters. @@ -244,7 +383,7 @@ class CosineStrategy(ExtractionStrategy): linkage_method (str): The linkage method for hierarchical clustering. top_k (int): Number of top categories to extract. """ - super().__init__() + super().__init__(**kwargs) import numpy as np @@ -310,11 +449,13 @@ class CosineStrategy(ExtractionStrategy): """ Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding. - :param documents: List of text chunks (documents). - :param semantic_filter: A string containing the keywords for filtering. - :param threshold: Cosine similarity threshold for filtering documents. - :param at_least_k: Minimum number of documents to return. - :return: List of filtered documents, ensuring at least `at_least_k` documents. + Args: + documents (List[str]): A list of document texts. + semantic_filter (str): A keyword filter for document filtering. + at_least_k (int): The minimum number of documents to return. + + Returns: + List[str]: A list of filtered and sorted document texts. """ if not semantic_filter: @@ -352,8 +493,11 @@ class CosineStrategy(ExtractionStrategy): """ Get BERT embeddings for a list of sentences. - :param sentences: List of text chunks (sentences). - :return: NumPy array of embeddings. + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of embeddings. """ # if self.buffer_embeddings.any() and not bypass_buffer: # return self.buffer_embeddings @@ -397,8 +541,11 @@ class CosineStrategy(ExtractionStrategy): """ Perform hierarchical clustering on sentences and return cluster labels. - :param sentences: List of text chunks (sentences). 
- :return: NumPy array of cluster labels. + Args: + sentences (List[str]): A list of text chunks (sentences). + + Returns: + NumPy array of cluster labels. """ # Get embeddings from scipy.cluster.hierarchy import linkage, fcluster @@ -414,12 +561,15 @@ class CosineStrategy(ExtractionStrategy): labels = fcluster(linked, self.max_dist, criterion='distance') return labels - def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]): + def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]: """ Filter clusters to remove those with a word count below the threshold. - :param clusters: Dictionary of clusters. - :return: Filtered dictionary of clusters. + Args: + clusters (Dict[int, List[str]]): Dictionary of clusters. + + Returns: + Dict[int, List[str]]: Filtered dictionary of clusters. """ filtered_clusters = {} for cluster_id, texts in clusters.items(): @@ -438,9 +588,12 @@ class CosineStrategy(ExtractionStrategy): """ Extract clusters from HTML content using hierarchical clustering. - :param url: The URL of the webpage. - :param html: The HTML content of the webpage. - :return: A list of dictionaries representing the clusters. + Args: + url (str): The URL of the webpage. + html (str): The HTML content of the webpage. + + Returns: + List[Dict[str, Any]]: A list of processed JSON blocks. """ # Assume `html` is a list of text chunks for this strategy t = time.time() @@ -502,170 +655,135 @@ class CosineStrategy(ExtractionStrategy): """ Process sections using hierarchical clustering. - :param url: The URL of the webpage. - :param sections: List of sections (strings) to process. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A list of processed JSON blocks. + Args: + url (str): The URL of the webpage. + sections (List[str]): List of sections (strings) to process. 
+ + Returns: """ # This strategy processes all sections together return self.extract(url, self.DEL.join(sections), **kwargs) -class TopicExtractionStrategy(ExtractionStrategy): - def __init__(self, num_keywords: int = 3, **kwargs): - """ - Initialize the topic extraction strategy with parameters for topic segmentation. +####################################################### +# New extraction strategies for JSON-based extraction # +####################################################### - :param num_keywords: Number of keywords to represent each topic segment. - """ - import nltk - super().__init__() - self.num_keywords = num_keywords - self.tokenizer = nltk.TextTilingTokenizer() +class JsonElementExtractionStrategy(ExtractionStrategy): + """ + Abstract base class for extracting structured JSON from HTML content. - def extract_keywords(self, text: str) -> List[str]: - """ - Extract keywords from a given text segment using simple frequency analysis. + How it works: + 1. Parses HTML content using the `_parse_html` method. + 2. Uses a schema to define base selectors, fields, and transformations. + 3. Extracts data hierarchically, supporting nested fields and lists. + 4. Handles computed fields with expressions or functions. - :param text: The text segment from which to extract keywords. - :return: A list of keyword strings. - """ - import nltk - # Tokenize the text and compute word frequency - words = nltk.word_tokenize(text) - freq_dist = nltk.FreqDist(words) - # Get the most common words as keywords - keywords = [word for (word, _) in freq_dist.most_common(self.num_keywords)] - return keywords + Attributes: + DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'. + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. 
- def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: - """ - Extract topics from HTML content using TextTiling for segmentation and keyword extraction. + Methods: + extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content. + _extract_item(element, fields): Extracts fields from a single element. + _extract_single_field(element, field): Extracts a single field based on its type. + _apply_transform(value, transform): Applies a transformation to a value. + _compute_field(item, field): Computes a field value using an expression or function. + run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy. - :param url: The URL of the webpage. - :param html: The HTML content of the webpage. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A list of dictionaries representing the topics. - """ - # Use TextTiling to segment the text into topics - segmented_topics = html.split(self.DEL) # Split by lines or paragraphs as needed + Abstract Methods: + _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml). + _get_base_elements(parsed_html, selector): Retrieves base elements using a selector. + _get_elements(element, selector): Retrieves child elements using a selector. + _get_element_text(element): Extracts text content from an element. + _get_element_html(element): Extracts raw HTML from an element. + _get_element_attribute(element, attribute): Extracts an attribute's value from an element. 
+ """ - # Prepare the output as a list of dictionaries - topic_list = [] - for i, segment in enumerate(segmented_topics): - # Extract keywords for each segment - keywords = self.extract_keywords(segment) - topic_list.append({ - "index": i, - "content": segment, - "keywords": keywords - }) - - return topic_list - - def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: - """ - Process sections using topic segmentation and keyword extraction. - - :param url: The URL of the webpage. - :param sections: List of sections (strings) to process. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A list of processed JSON blocks. - """ - # Concatenate sections into a single text for coherent topic segmentation - - - return self.extract(url, self.DEL.join(sections), **kwargs) -class ContentSummarizationStrategy(ExtractionStrategy): - def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs): - """ - Initialize the content summarization strategy with a specific model. + DEL = '\n' - :param model_name: The model to use for summarization. - """ - from transformers import pipeline - self.summarizer = pipeline("summarization", model=model_name) - - def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]: - """ - Summarize a single section of text. - - :param url: The URL of the webpage. - :param text: A section of text to summarize. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A dictionary with the summary. 
- """ - try: - summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False) - return {"summary": summary[0]['summary_text']} - except Exception as e: - print(f"Error summarizing text: {e}") - return {"summary": text} # Fallback to original text if summarization fails - - def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]: - """ - Process each section in parallel to produce summaries. - - :param url: The URL of the webpage. - :param sections: List of sections (strings) to summarize. - :param provider: The provider to be used for extraction (not used here). - :param api_token: Optional API token for the provider (not used here). - :return: A list of dictionaries with summaries for each section. - """ - # Use a ThreadPoolExecutor to summarize in parallel - summaries = [] - with ThreadPoolExecutor() as executor: - # Create a future for each section's summarization - future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)} - for future in as_completed(future_to_section): - section_index = future_to_section[future] - try: - summary_result = future.result() - summaries.append((section_index, summary_result)) - except Exception as e: - print(f"Error processing section {section_index}: {e}") - summaries.append((section_index, {"summary": sections[section_index]})) # Fallback to original text - - # Sort summaries by the original section index to maintain order - summaries.sort(key=lambda x: x[0]) - return [summary for _, summary in summaries] - -class JsonCssExtractionStrategy(ExtractionStrategy): def __init__(self, schema: Dict[str, Any], **kwargs): + """ + Initialize the JSON element extraction strategy with a schema. + + Args: + schema (Dict[str, Any]): The schema defining the extraction rules. 
+ """ super().__init__(**kwargs) self.schema = schema + self.verbose = kwargs.get('verbose', False) - def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: - soup = BeautifulSoup(html, 'html.parser') - base_elements = soup.select(self.schema['baseSelector']) + def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]: + """ + Extract structured data from HTML content. + + How it works: + 1. Parses the HTML content using the `_parse_html` method. + 2. Identifies base elements using the schema's base selector. + 3. Extracts fields from each base element using `_extract_item`. + + Args: + url (str): The URL of the page being processed. + html_content (str): The raw HTML content to parse and extract. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary. + """ + + parsed_html = self._parse_html(html_content) + base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector']) results = [] for element in base_elements: - item = self._extract_item(element, self.schema['fields']) + # Extract base element attributes + item = {} + if 'baseFields' in self.schema: + for field in self.schema['baseFields']: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + + # Extract child fields + field_data = self._extract_item(element, self.schema['fields']) + item.update(field_data) + if item: results.append(item) return results - + @abstractmethod + def _parse_html(self, html_content: str): + """Parse HTML content into appropriate format""" + pass + + @abstractmethod + def _get_base_elements(self, parsed_html, selector: str): + """Get all base elements using the selector""" + pass + + @abstractmethod + def _get_elements(self, element, selector: str): + """Get child elements using the selector""" + pass def 
_extract_field(self, element, field): try: if field['type'] == 'nested': - nested_element = element.select_one(field['selector']) + nested_elements = self._get_elements(element, field['selector']) + nested_element = nested_elements[0] if nested_elements else None return self._extract_item(nested_element, field['fields']) if nested_element else {} if field['type'] == 'list': - elements = element.select(field['selector']) + elements = self._get_elements(element, field['selector']) return [self._extract_list_item(el, field['fields']) for el in elements] if field['type'] == 'nested_list': - elements = element.select(field['selector']) + elements = self._get_elements(element, field['selector']) return [self._extract_item(el, field['fields']) for el in elements] return self._extract_single_field(element, field) @@ -674,146 +792,25 @@ class JsonCssExtractionStrategy(ExtractionStrategy): print(f"Error extracting field {field['name']}: {str(e)}") return field.get('default') - def _extract_list_item(self, element, fields): - item = {} - for field in fields: - value = self._extract_single_field(element, field) - if value is not None: - item[field['name']] = value - return item - def _extract_single_field(self, element, field): - if 'selector' in field: - selected = element.select_one(field['selector']) - if not selected: - return field.get('default') - else: - selected = element + """ + Extract a single field based on its type. - value = None - if field['type'] == 'text': - value = selected.get_text(strip=True) - elif field['type'] == 'attribute': - value = selected.get(field['attribute']) - elif field['type'] == 'html': - value = str(selected) - elif field['type'] == 'regex': - text = selected.get_text(strip=True) - match = re.search(field['pattern'], text) - value = match.group(1) if match else None + How it works: + 1. Selects the target element using the field's selector. + 2. Extracts the field value based on its type (e.g., text, attribute, regex). + 3. 
Applies transformations if defined in the schema. - if 'transform' in field: - value = self._apply_transform(value, field['transform']) + Args: + element: The base element to extract the field from. + field (Dict[str, Any]): The field definition in the schema. - return value if value is not None else field.get('default') - - def _extract_item(self, element, fields): - item = {} - for field in fields: - if field['type'] == 'computed': - value = self._compute_field(item, field) - else: - value = self._extract_field(element, field) - if value is not None: - item[field['name']] = value - return item - - def _apply_transform(self, value, transform): - if transform == 'lowercase': - return value.lower() - elif transform == 'uppercase': - return value.upper() - elif transform == 'strip': - return value.strip() - return value - - def _compute_field(self, item, field): - try: - if 'expression' in field: - return eval(field['expression'], {}, item) - elif 'function' in field: - return field['function'](item) - except Exception as e: - if self.verbose: - print(f"Error computing field {field['name']}: {str(e)}") - return field.get('default') - - def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: - combined_html = self.DEL.join(sections) - return self.extract(url, combined_html, **kwargs) - -class JsonXPATHExtractionStrategy(ExtractionStrategy): - def __init__(self, schema: Dict[str, Any], **kwargs): - super().__init__(**kwargs) - self.schema = schema - self.use_cssselect = self._check_cssselect() - - def _check_cssselect(self): - try: - import cssselect - return True - except ImportError: - print("Warning: cssselect is not installed. 
Falling back to XPath for all selectors.") - return False - - def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: - self.soup = BeautifulSoup(html, 'lxml') - self.tree = etree.HTML(str(self.soup)) - - selector_type = 'xpath' if not self.use_cssselect else self.schema.get('selectorType', 'css') - base_selector = self.schema.get('baseXPath' if selector_type == 'xpath' else 'baseSelector') - base_elements = self._select_elements(base_selector, selector_type) - - results = [] - for element in base_elements: - item = self._extract_item(element, self.schema['fields']) - if item: - results.append(item) - - return results - - def _select_elements(self, selector, selector_type, element=None): - if selector_type == 'xpath' or not self.use_cssselect: - return self.tree.xpath(selector) if element is None else element.xpath(selector) - else: # CSS - return self.tree.cssselect(selector) if element is None else element.cssselect(selector) - - def _extract_field(self, element, field): - try: - selector_type = 'xpath' if not self.use_cssselect else field.get('selectorType', 'css') - selector = field.get('xpathSelector' if selector_type == 'xpath' else 'selector') - - if field['type'] == 'nested': - nested_element = self._select_elements(selector, selector_type, element) - return self._extract_item(nested_element[0], field['fields']) if nested_element else {} - - if field['type'] == 'list': - elements = self._select_elements(selector, selector_type, element) - return [self._extract_list_item(el, field['fields']) for el in elements] - - if field['type'] == 'nested_list': - elements = self._select_elements(selector, selector_type, element) - return [self._extract_item(el, field['fields']) for el in elements] - - return self._extract_single_field(element, field) - except Exception as e: - if self.verbose: - print(f"Error extracting field {field['name']}: {str(e)}") - return field.get('default') - - def _extract_list_item(self, element, fields): - item = {} - 
for field in fields: - value = self._extract_single_field(element, field) - if value is not None: - item[field['name']] = value - return item - - def _extract_single_field(self, element, field): - selector_type = field.get('selectorType', 'css') + Returns: + Any: The extracted field value. + """ if 'selector' in field: - selected = self._select_elements(field['selector'], selector_type, element) + selected = self._get_elements(element, field['selector']) if not selected: return field.get('default') selected = selected[0] @@ -822,13 +819,13 @@ class JsonXPATHExtractionStrategy(ExtractionStrategy): value = None if field['type'] == 'text': - value = selected.text_content().strip() if hasattr(selected, 'text_content') else selected.text.strip() + value = self._get_element_text(selected) elif field['type'] == 'attribute': - value = selected.get(field['attribute']) + value = self._get_element_attribute(selected, field['attribute']) elif field['type'] == 'html': - value = etree.tostring(selected, encoding='unicode') + value = self._get_element_html(selected) elif field['type'] == 'regex': - text = selected.text_content().strip() if hasattr(selected, 'text_content') else selected.text.strip() + text = self._get_element_text(selected) match = re.search(field['pattern'], text) value = match.group(1) if match else None @@ -837,7 +834,31 @@ class JsonXPATHExtractionStrategy(ExtractionStrategy): return value if value is not None else field.get('default') + def _extract_list_item(self, element, fields): + item = {} + for field in fields: + value = self._extract_single_field(element, field) + if value is not None: + item[field['name']] = value + return item + def _extract_item(self, element, fields): + """ + Extracts fields from a given element. + + How it works: + 1. Iterates through the fields defined in the schema. + 2. Handles computed, single, and nested field types. + 3. Updates the item dictionary with extracted field values. 
+ + Args: + element: The base element to extract fields from. + fields (List[Dict[str, Any]]): The list of fields to extract. + + Returns: + Dict[str, Any]: A dictionary representing the extracted item. + """ + item = {} for field in fields: if field['type'] == 'computed': @@ -847,8 +868,24 @@ class JsonXPATHExtractionStrategy(ExtractionStrategy): if value is not None: item[field['name']] = value return item - + def _apply_transform(self, value, transform): + """ + Apply a transformation to a value. + + How it works: + 1. Checks the transformation type (e.g., `lowercase`, `strip`). + 2. Applies the transformation to the value. + 3. Returns the transformed value. + + Args: + value (str): The value to transform. + transform (str): The type of transformation to apply. + + Returns: + str: The transformed value. + """ + if transform == 'lowercase': return value.lower() elif transform == 'uppercase': @@ -869,5 +906,147 @@ class JsonXPATHExtractionStrategy(ExtractionStrategy): return field.get('default') def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: + """ + Run the extraction strategy on a combined HTML content. + + How it works: + 1. Combines multiple HTML sections using the `DEL` delimiter. + 2. Calls the `extract` method with the combined HTML. + + Args: + url (str): The URL of the page being processed. + sections (List[str]): A list of HTML sections. + *q: Additional positional arguments. + **kwargs: Additional keyword arguments for custom extraction. + + Returns: + List[Dict[str, Any]]: A list of extracted items. 
+ """ + combined_html = self.DEL.join(sections) return self.extract(url, combined_html, **kwargs) + + @abstractmethod + def _get_element_text(self, element) -> str: + """Get text content from element""" + pass + + @abstractmethod + def _get_element_html(self, element) -> str: + """Get HTML content from element""" + pass + + @abstractmethod + def _get_element_attribute(self, element, attribute: str): + """Get attribute value from element""" + pass + +class JsonCssExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors. + + How it works: + 1. Parses HTML content with BeautifulSoup. + 2. Selects elements using CSS selectors defined in the schema. + 3. Extracts field data and applies transformations as defined. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into a BeautifulSoup object. + _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector. + _get_elements(element, selector): Selects child elements using a CSS selector. + _get_element_text(element): Extracts text content from a BeautifulSoup element. + _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element. + _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element. 
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return BeautifulSoup(html_content, 'html.parser') + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.select(selector) + + def _get_elements(self, element, selector: str): + selected = element.select_one(selector) + return [selected] if selected else [] + + def _get_element_text(self, element) -> str: + return element.get_text(strip=True) + + def _get_element_html(self, element) -> str: + return str(element) + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + +class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): + """ + Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors. + + How it works: + 1. Parses HTML content into an lxml tree. + 2. Selects elements using XPath expressions. + 3. Converts CSS selectors to XPath when needed. + + Attributes: + schema (Dict[str, Any]): The schema defining the extraction rules. + verbose (bool): Enables verbose logging for debugging purposes. + + Methods: + _parse_html(html_content): Parses HTML content into an lxml tree. + _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector. + _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression. + _get_elements(element, selector): Selects child elements using an XPath selector. + _get_element_text(element): Extracts text content from an lxml element. + _get_element_html(element): Extracts the raw HTML content of an lxml element. + _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element. 
+ """ + + def __init__(self, schema: Dict[str, Any], **kwargs): + kwargs['input_format'] = 'html' # Force HTML input + super().__init__(schema, **kwargs) + + def _parse_html(self, html_content: str): + return html.fromstring(html_content) + + def _get_base_elements(self, parsed_html, selector: str): + return parsed_html.xpath(selector) + + def _css_to_xpath(self, css_selector: str) -> str: + """Convert CSS selector to XPath if needed""" + if '/' in css_selector: # Already an XPath + return css_selector + return self._basic_css_to_xpath(css_selector) + + def _basic_css_to_xpath(self, css_selector: str) -> str: + """Basic CSS to XPath conversion for common cases""" + if ' > ' in css_selector: + parts = css_selector.split(' > ') + return '//' + '/'.join(parts) + if ' ' in css_selector: + parts = css_selector.split(' ') + return '//' + '//'.join(parts) + return '//' + css_selector + + def _get_elements(self, element, selector: str): + xpath = self._css_to_xpath(selector) + if not xpath.startswith('.'): + xpath = '.' 
+ xpath + return element.xpath(xpath) + + def _get_element_text(self, element) -> str: + return ''.join(element.xpath('.//text()')).strip() + + def _get_element_html(self, element) -> str: + return etree.tostring(element, encoding='unicode') + + def _get_element_attribute(self, element, attribute: str): + return element.get(attribute) + diff --git a/crawl4ai/install.py b/crawl4ai/install.py index 71fe30ea..4a3f5d45 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -2,6 +2,7 @@ import subprocess import sys import asyncio from .async_logger import AsyncLogger, LogLevel +from .docs_manager import DocsManager # Initialize logger logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) @@ -11,6 +12,7 @@ def post_install(): logger.info("Running post-installation setup...", tag="INIT") install_playwright() run_migration() + asyncio.run(setup_docs()) logger.success("Post-installation setup completed!", tag="COMPLETE") def install_playwright(): @@ -41,4 +43,9 @@ def run_migration(): logger.warning("Database module not found. 
Will initialize on first use.") except Exception as e: logger.warning(f"Database initialization failed: {e}") - logger.warning("Database will be initialized on first use") \ No newline at end of file + logger.warning("Database will be initialized on first use") + +async def setup_docs(): + """Download documentation files""" + docs_manager = DocsManager(logger) + await docs_manager.update_docs() \ No newline at end of file diff --git a/crawl4ai/llmtxt.py b/crawl4ai/llmtxt.py new file mode 100644 index 00000000..94efe076 --- /dev/null +++ b/crawl4ai/llmtxt.py @@ -0,0 +1,498 @@ +import os +from pathlib import Path +import re +from typing import Dict, List, Tuple, Optional, Any +import json +from tqdm import tqdm +import time +import psutil +import numpy as np +from rank_bm25 import BM25Okapi +from nltk.tokenize import word_tokenize +from nltk.corpus import stopwords +from nltk.stem import WordNetLemmatizer +from litellm import completion, batch_completion +from .async_logger import AsyncLogger +import litellm +import pickle +import hashlib # <--- ADDED for file-hash +from fnmatch import fnmatch +import glob + +litellm.set_verbose = False + +def _compute_file_hash(file_path: Path) -> str: + """Compute MD5 hash for the file's entire content.""" + hash_md5 = hashlib.md5() + with file_path.open("rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + +class AsyncLLMTextManager: + def __init__( + self, + docs_dir: Path, + logger: Optional[AsyncLogger] = None, + max_concurrent_calls: int = 5, + batch_size: int = 3 + ) -> None: + self.docs_dir = docs_dir + self.logger = logger + self.max_concurrent_calls = max_concurrent_calls + self.batch_size = batch_size + self.bm25_index = None + self.document_map: Dict[str, Any] = {} + self.tokenized_facts: List[str] = [] + self.bm25_index_file = self.docs_dir / "bm25_index.pkl" + + async def _process_document_batch(self, doc_batch: List[Path]) -> None: + """Process a batch of 
documents in parallel""" + contents = [] + for file_path in doc_batch: + try: + with open(file_path, 'r', encoding='utf-8') as f: + contents.append(f.read()) + except Exception as e: + self.logger.error(f"Error reading {file_path}: {str(e)}") + contents.append("") # Add empty content to maintain batch alignment + + prompt = """Given a documentation file, generate a list of atomic facts where each fact: +1. Represents a single piece of knowledge +2. Contains variations in terminology for the same concept +3. References relevant code patterns if they exist +4. Is written in a way that would match natural language queries + +Each fact should follow this format: +: | | + +Example Facts: +browser_config: Configure headless mode and browser type for AsyncWebCrawler | headless, browser_type, chromium, firefox | BrowserConfig(browser_type="chromium", headless=True) +redis_connection: Redis client connection requires host and port configuration | redis setup, redis client, connection params | Redis(host='localhost', port=6379, db=0) +pandas_filtering: Filter DataFrame rows using boolean conditions | dataframe filter, query, boolean indexing | df[df['column'] > 5] + +Wrap your response in ... tags. +""" + + # Prepare messages for batch processing + messages_list = [ + [ + {"role": "user", "content": f"{prompt}\n\nGenerate index for this documentation:\n\n{content}"} + ] + for content in contents if content + ] + + try: + responses = batch_completion( + model="anthropic/claude-3-5-sonnet-latest", + messages=messages_list, + logger_fn=None + ) + + # Process responses and save index files + for response, file_path in zip(responses, doc_batch): + try: + index_content_match = re.search( + r'(.*?)', + response.choices[0].message.content, + re.DOTALL + ) + if not index_content_match: + self.logger.warning(f"No ... 
content found for {file_path}") + continue + + index_content = re.sub( + r"\n\s*\n", "\n", index_content_match.group(1) + ).strip() + if index_content: + index_file = file_path.with_suffix('.q.md') + with open(index_file, 'w', encoding='utf-8') as f: + f.write(index_content) + self.logger.info(f"Created index file: {index_file}") + else: + self.logger.warning(f"No index content found in response for {file_path}") + + except Exception as e: + self.logger.error(f"Error processing response for {file_path}: {str(e)}") + + except Exception as e: + self.logger.error(f"Error in batch completion: {str(e)}") + + def _validate_fact_line(self, line: str) -> Tuple[bool, Optional[str]]: + if "|" not in line: + return False, "Missing separator '|'" + + parts = [p.strip() for p in line.split("|")] + if len(parts) != 3: + return False, f"Expected 3 parts, got {len(parts)}" + + concept_part = parts[0] + if ":" not in concept_part: + return False, "Missing ':' in concept definition" + + return True, None + + def _load_or_create_token_cache(self, fact_file: Path) -> Dict: + """ + Load token cache from .q.tokens if present and matching file hash. + Otherwise return a new structure with updated file-hash. 
+ """ + cache_file = fact_file.with_suffix(".q.tokens") + current_hash = _compute_file_hash(fact_file) + + if cache_file.exists(): + try: + with open(cache_file, "r") as f: + cache = json.load(f) + # If the hash matches, return it directly + if cache.get("content_hash") == current_hash: + return cache + # Otherwise, we signal that it's changed + self.logger.info(f"Hash changed for {fact_file}, reindex needed.") + except json.JSONDecodeError: + self.logger.warning(f"Corrupt token cache for {fact_file}, rebuilding.") + except Exception as e: + self.logger.warning(f"Error reading cache for {fact_file}: {str(e)}") + + # Return a fresh cache + return {"facts": {}, "content_hash": current_hash} + + def _save_token_cache(self, fact_file: Path, cache: Dict) -> None: + cache_file = fact_file.with_suffix(".q.tokens") + # Always ensure we're saving the correct file-hash + cache["content_hash"] = _compute_file_hash(fact_file) + with open(cache_file, "w") as f: + json.dump(cache, f) + + def preprocess_text(self, text: str) -> List[str]: + parts = [x.strip() for x in text.split("|")] if "|" in text else [text] + # Remove : after the first word of parts[0] + parts[0] = re.sub(r"^(.*?):", r"\1", parts[0]) + + lemmatizer = WordNetLemmatizer() + stop_words = set(stopwords.words("english")) - { + "how", "what", "when", "where", "why", "which", + } + + tokens = [] + for part in parts: + if "(" in part and ")" in part: + code_tokens = re.findall( + r'[\w_]+(?=\()|[\w_]+(?==[\'"]{1}[\w_]+[\'"]{1})', part + ) + tokens.extend(code_tokens) + + words = word_tokenize(part.lower()) + tokens.extend( + [ + lemmatizer.lemmatize(token) + for token in words + if token not in stop_words + ] + ) + + return tokens + + def maybe_load_bm25_index(self, clear_cache=False) -> bool: + """ + Load existing BM25 index from disk, if present and clear_cache=False. 
+ """ + if not clear_cache and os.path.exists(self.bm25_index_file): + self.logger.info("Loading existing BM25 index from disk.") + with open(self.bm25_index_file, "rb") as f: + data = pickle.load(f) + self.tokenized_facts = data["tokenized_facts"] + self.bm25_index = data["bm25_index"] + return True + return False + + def build_search_index(self, clear_cache=False) -> None: + """ + Checks for new or modified .q.md files by comparing file-hash. + If none need reindexing and clear_cache is False, loads existing index if available. + Otherwise, reindexes only changed/new files and merges or creates a new index. + """ + # If clear_cache is True, we skip partial logic: rebuild everything from scratch + if clear_cache: + self.logger.info("Clearing cache and rebuilding full search index.") + if self.bm25_index_file.exists(): + self.bm25_index_file.unlink() + + process = psutil.Process() + self.logger.info("Checking which .q.md files need (re)indexing...") + + # Gather all .q.md files + q_files = [self.docs_dir / f for f in os.listdir(self.docs_dir) if f.endswith(".q.md")] + + # We'll store known (unchanged) facts in these lists + existing_facts: List[str] = [] + existing_tokens: List[List[str]] = [] + + # Keep track of invalid lines for logging + invalid_lines = [] + needSet = [] # files that must be (re)indexed + + for qf in q_files: + token_cache_file = qf.with_suffix(".q.tokens") + + # If no .q.tokens or clear_cache is True → definitely reindex + if clear_cache or not token_cache_file.exists(): + needSet.append(qf) + continue + + # Otherwise, load the existing cache and compare hash + cache = self._load_or_create_token_cache(qf) + # If the .q.tokens was out of date (i.e. 
changed hash), we reindex + if len(cache["facts"]) == 0 or cache.get("content_hash") != _compute_file_hash(qf): + needSet.append(qf) + else: + # File is unchanged → retrieve cached token data + for line, cache_data in cache["facts"].items(): + existing_facts.append(line) + existing_tokens.append(cache_data["tokens"]) + self.document_map[line] = qf # track the doc for that fact + + if not needSet and not clear_cache: + # If no file needs reindexing, try loading existing index + if self.maybe_load_bm25_index(clear_cache=False): + self.logger.info("No new/changed .q.md files found. Using existing BM25 index.") + return + else: + # If there's no existing index, we must build a fresh index from the old caches + self.logger.info("No existing BM25 index found. Building from cached facts.") + if existing_facts: + self.logger.info(f"Building BM25 index with {len(existing_facts)} cached facts.") + self.bm25_index = BM25Okapi(existing_tokens) + self.tokenized_facts = existing_facts + with open(self.bm25_index_file, "wb") as f: + pickle.dump({ + "bm25_index": self.bm25_index, + "tokenized_facts": self.tokenized_facts + }, f) + else: + self.logger.warning("No facts found at all. Index remains empty.") + return + + # ----------------------------------------------------- /Users/unclecode/.crawl4ai/docs/14_proxy_security.q.q.tokens '/Users/unclecode/.crawl4ai/docs/14_proxy_security.q.md' + # If we reach here, we have new or changed .q.md files + # We'll parse them, reindex them, and then combine with existing_facts + # ----------------------------------------------------- + + self.logger.info(f"{len(needSet)} file(s) need reindexing. 
Parsing now...") + + # 1) Parse the new or changed .q.md files + new_facts = [] + new_tokens = [] + with tqdm(total=len(needSet), desc="Indexing changed files") as file_pbar: + for file in needSet: + # We'll build up a fresh cache + fresh_cache = {"facts": {}, "content_hash": _compute_file_hash(file)} + try: + with open(file, "r", encoding="utf-8") as f_obj: + content = f_obj.read().strip() + lines = [l.strip() for l in content.split("\n") if l.strip()] + + for line in lines: + is_valid, error = self._validate_fact_line(line) + if not is_valid: + invalid_lines.append((file, line, error)) + continue + + tokens = self.preprocess_text(line) + fresh_cache["facts"][line] = { + "tokens": tokens, + "added": time.time(), + } + new_facts.append(line) + new_tokens.append(tokens) + self.document_map[line] = file + + # Save the new .q.tokens with updated hash + self._save_token_cache(file, fresh_cache) + + mem_usage = process.memory_info().rss / 1024 / 1024 + self.logger.debug(f"Memory usage after {file.name}: {mem_usage:.2f}MB") + + except Exception as e: + self.logger.error(f"Error processing {file}: {str(e)}") + + file_pbar.update(1) + + if invalid_lines: + self.logger.warning(f"Found {len(invalid_lines)} invalid fact lines:") + for file, line, error in invalid_lines: + self.logger.warning(f"{file}: {error} in line: {line[:50]}...") + + # 2) Merge newly tokenized facts with the existing ones + all_facts = existing_facts + new_facts + all_tokens = existing_tokens + new_tokens + + # 3) Build BM25 index from combined facts + self.logger.info(f"Building BM25 index with {len(all_facts)} total facts (old + new).") + self.bm25_index = BM25Okapi(all_tokens) + self.tokenized_facts = all_facts + + # 4) Save the updated BM25 index to disk + with open(self.bm25_index_file, "wb") as f: + pickle.dump({ + "bm25_index": self.bm25_index, + "tokenized_facts": self.tokenized_facts + }, f) + + final_mem = process.memory_info().rss / 1024 / 1024 + self.logger.info(f"Search index updated. 
Final memory usage: {final_mem:.2f}MB") + + async def generate_index_files(self, force_generate_facts: bool = False, clear_bm25_cache: bool = False) -> None: + """ + Generate index files for all documents in parallel batches + + Args: + force_generate_facts (bool): If True, regenerate indexes even if they exist + clear_bm25_cache (bool): If True, clear existing BM25 index cache + """ + self.logger.info("Starting index generation for documentation files.") + + md_files = [ + self.docs_dir / f for f in os.listdir(self.docs_dir) + if f.endswith('.md') and not any(f.endswith(x) for x in ['.q.md', '.xs.md']) + ] + + # Filter out files that already have .q files unless force=True + if not force_generate_facts: + md_files = [ + f for f in md_files + if not (self.docs_dir / f.name.replace('.md', '.q.md')).exists() + ] + + if not md_files: + self.logger.info("All index files exist. Use force=True to regenerate.") + else: + # Process documents in batches + for i in range(0, len(md_files), self.batch_size): + batch = md_files[i:i + self.batch_size] + self.logger.info(f"Processing batch {i//self.batch_size + 1}/{(len(md_files)//self.batch_size) + 1}") + await self._process_document_batch(batch) + + self.logger.info("Index generation complete, building/updating search index.") + self.build_search_index(clear_cache=clear_bm25_cache) + + def generate(self, sections: List[str], mode: str = "extended") -> str: + # Get all markdown files + all_files = glob.glob(str(self.docs_dir / "[0-9]*.md")) + \ + glob.glob(str(self.docs_dir / "[0-9]*.xs.md")) + + # Extract base names without extensions + base_docs = {Path(f).name.split('.')[0] for f in all_files + if not Path(f).name.endswith('.q.md')} + + # Filter by sections if provided + if sections: + base_docs = {doc for doc in base_docs + if any(section.lower() in doc.lower() for section in sections)} + + # Get file paths based on mode + files = [] + for doc in sorted(base_docs, key=lambda x: int(x.split('_')[0]) if 
x.split('_')[0].isdigit() else 999999): + if mode == "condensed": + xs_file = self.docs_dir / f"{doc}.xs.md" + regular_file = self.docs_dir / f"{doc}.md" + files.append(str(xs_file if xs_file.exists() else regular_file)) + else: + files.append(str(self.docs_dir / f"{doc}.md")) + + # Read and format content + content = [] + for file in files: + try: + with open(file, 'r', encoding='utf-8') as f: + fname = Path(file).name + content.append(f"{'#'*20}\n# {fname}\n{'#'*20}\n\n{f.read()}") + except Exception as e: + self.logger.error(f"Error reading {file}: {str(e)}") + + return "\n\n---\n\n".join(content) if content else "" + + def search(self, query: str, top_k: int = 5) -> str: + if not self.bm25_index: + return "No search index available. Call build_search_index() first." + + query_tokens = self.preprocess_text(query) + doc_scores = self.bm25_index.get_scores(query_tokens) + + mean_score = np.mean(doc_scores) + std_score = np.std(doc_scores) + score_threshold = mean_score + (0.25 * std_score) + + file_data = self._aggregate_search_scores( + doc_scores=doc_scores, + score_threshold=score_threshold, + query_tokens=query_tokens, + ) + + ranked_files = sorted( + file_data.items(), + key=lambda x: ( + x[1]["code_match_score"] * 2.0 + + x[1]["match_count"] * 1.5 + + x[1]["total_score"] + ), + reverse=True, + )[:top_k] + + results = [] + for file, _ in ranked_files: + main_doc = str(file).replace(".q.md", ".md") + if os.path.exists(self.docs_dir / main_doc): + with open(self.docs_dir / main_doc, "r", encoding='utf-8') as f: + only_file_name = main_doc.split("/")[-1] + content = [ + "#" * 20, + f"# {only_file_name}", + "#" * 20, + "", + f.read() + ] + results.append("\n".join(content)) + + return "\n\n---\n\n".join(results) + + def _aggregate_search_scores( + self, doc_scores: List[float], score_threshold: float, query_tokens: List[str] + ) -> Dict: + file_data = {} + + for idx, score in enumerate(doc_scores): + if score <= score_threshold: + continue + + fact = 
self.tokenized_facts[idx] + file_path = self.document_map[fact] + + if file_path not in file_data: + file_data[file_path] = { + "total_score": 0, + "match_count": 0, + "code_match_score": 0, + "matched_facts": [], + } + + components = fact.split("|") if "|" in fact else [fact] + + code_match_score = 0 + if len(components) == 3: + code_ref = components[2].strip() + code_tokens = self.preprocess_text(code_ref) + code_match_score = len(set(query_tokens) & set(code_tokens)) / len(query_tokens) + + file_data[file_path]["total_score"] += score + file_data[file_path]["match_count"] += 1 + file_data[file_path]["code_match_score"] = max( + file_data[file_path]["code_match_score"], code_match_score + ) + file_data[file_path]["matched_facts"].append(fact) + + return file_data + + def refresh_index(self) -> None: + """Convenience method for a full rebuild.""" + self.build_search_index(clear_cache=True) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index b9e4b0c6..474dc9e8 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -38,11 +38,44 @@ class MarkdownGenerationStrategy(ABC): pass class DefaultMarkdownGenerator(MarkdownGenerationStrategy): - """Default implementation of markdown generation strategy.""" + """ + Default implementation of markdown generation strategy. + + How it works: + 1. Generate raw markdown from cleaned HTML. + 2. Convert links to citations. + 3. Generate fit markdown if content filter is provided. + 4. Return MarkdownGenerationResult. + + Args: + content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. + options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None. + + Returns: + MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. 
+ """ def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None): super().__init__(content_filter, options) def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: + """ + Convert links in markdown to citations. + + How it works: + 1. Find all links in the markdown. + 2. Convert links to citations. + 3. Return converted markdown and references markdown. + + Note: + This function uses a regex pattern to find links in markdown. + + Args: + markdown (str): Markdown text. + base_url (str): Base URL for URL joins. + + Returns: + Tuple[str, str]: Converted markdown and references markdown. + """ link_map = {} url_cache = {} # Cache for URL joins parts = [] @@ -90,7 +123,26 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult: - """Generate markdown with citations from cleaned HTML.""" + """ + Generate markdown with citations from cleaned HTML. + + How it works: + 1. Generate raw markdown from cleaned HTML. + 2. Convert links to citations. + 3. Generate fit markdown if content filter is provided. + 4. Return MarkdownGenerationResult. + + Args: + cleaned_html (str): Cleaned HTML content. + base_url (str): Base URL for URL joins. + html2text_options (Optional[Dict[str, Any]]): HTML2Text options. + options (Optional[Dict[str, Any]]): Additional options for markdown generation. + content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. + citations (bool): Whether to generate citations. + + Returns: + MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. 
+ """ # Initialize HTML2Text with options h = CustomHTML2Text() if html2text_options: diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 315069fb..6fb362a3 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,7 +1,16 @@ from pydantic import BaseModel, HttpUrl -from typing import List, Dict, Optional, Callable, Awaitable, Union - +from typing import List, Dict, Optional, Callable, Awaitable, Union, Any +from dataclasses import dataclass +from .ssl_certificate import SSLCertificate +@dataclass +class TokenUsage: + completion_tokens: int = 0 + prompt_tokens: int = 0 + total_tokens: int = 0 + completion_tokens_details: Optional[dict] = None + prompt_tokens_details: Optional[dict] = None + class UrlModel(BaseModel): url: HttpUrl @@ -34,7 +43,10 @@ class CrawlResult(BaseModel): session_id: Optional[str] = None response_headers: Optional[dict] = None status_code: Optional[int] = None - + ssl_certificate: Optional[SSLCertificate] = None + class Config: + arbitrary_types_allowed = True + class AsyncCrawlResponse(BaseModel): html: str response_headers: Dict[str, str] @@ -43,8 +55,7 @@ class AsyncCrawlResponse(BaseModel): pdf_data: Optional[bytes] = None get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None downloaded_files: Optional[List[str]] = None + ssl_certificate: Optional[SSLCertificate] = None class Config: arbitrary_types_allowed = True - - diff --git a/crawl4ai/ssl_certificate.py b/crawl4ai/ssl_certificate.py new file mode 100644 index 00000000..97529e3e --- /dev/null +++ b/crawl4ai/ssl_certificate.py @@ -0,0 +1,181 @@ +"""SSL Certificate class for handling certificate operations.""" + +import ssl +import socket +import base64 +import json +from typing import Dict, Any, Optional +from urllib.parse import urlparse +import OpenSSL.crypto +from pathlib import Path + + +class SSLCertificate: + """ + A class representing an SSL certificate with methods to export in various formats. 
+ + Attributes: + _cert_info (Dict[str, Any]): The decoded certificate information. + + Methods: + from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL. + to_json(filepath: Optional[str] = None) -> Optional[str]: Export the certificate as JSON, or write it to filepath and return None. + to_pem(filepath: Optional[str] = None) -> Optional[str]: Export the certificate as PEM, or write it to filepath and return None. + to_der(filepath: Optional[str] = None) -> Optional[bytes]: Export the certificate as DER, or write it to filepath and return None. + + Properties: + issuer, subject, valid_from, valid_until, fingerprint: Read-only views of the certificate information. + """ + def __init__(self, cert_info: Dict[str, Any]): + self._cert_info = self._decode_cert_data(cert_info) + + @staticmethod + def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: + """ + Create SSLCertificate instance from a URL. + + Args: + url (str): URL of the website. + timeout (int): Timeout for the connection (default: 10). + + Returns: + Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise. 
+ """ + try: + hostname = urlparse(url).netloc + if ':' in hostname: + hostname = hostname.split(':')[0] + + context = ssl.create_default_context() + with socket.create_connection((hostname, 443), timeout=timeout) as sock: + with context.wrap_socket(sock, server_hostname=hostname) as ssock: + cert_binary = ssock.getpeercert(binary_form=True) + x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary) + + cert_info = { + "subject": dict(x509.get_subject().get_components()), + "issuer": dict(x509.get_issuer().get_components()), + "version": x509.get_version(), + "serial_number": hex(x509.get_serial_number()), + "not_before": x509.get_notBefore(), + "not_after": x509.get_notAfter(), + "fingerprint": x509.digest("sha256").hex(), + "signature_algorithm": x509.get_signature_algorithm(), + "raw_cert": base64.b64encode(cert_binary) + } + + # Add extensions + extensions = [] + for i in range(x509.get_extension_count()): + ext = x509.get_extension(i) + extensions.append({ + "name": ext.get_short_name(), + "value": str(ext) + }) + cert_info["extensions"] = extensions + + return SSLCertificate(cert_info) + + except Exception as e: + return None + + @staticmethod + def _decode_cert_data(data: Any) -> Any: + """Helper method to decode bytes in certificate data.""" + if isinstance(data, bytes): + return data.decode('utf-8') + elif isinstance(data, dict): + return { + (k.decode('utf-8') if isinstance(k, bytes) else k): SSLCertificate._decode_cert_data(v) + for k, v in data.items() + } + elif isinstance(data, list): + return [SSLCertificate._decode_cert_data(item) for item in data] + return data + + def to_json(self, filepath: Optional[str] = None) -> Optional[str]: + """ + Export certificate as JSON. + + Args: + filepath (Optional[str]): Path to save the JSON file (default: None). + + Returns: + Optional[str]: JSON string if successful, None otherwise. 
+ """ + json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False) + if filepath: + Path(filepath).write_text(json_str, encoding='utf-8') + return None + return json_str + + def to_pem(self, filepath: Optional[str] = None) -> Optional[str]: + """ + Export certificate as PEM. + + Args: + filepath (Optional[str]): Path to save the PEM file (default: None). + + Returns: + Optional[str]: PEM string if successful, None otherwise. + """ + try: + x509 = OpenSSL.crypto.load_certificate( + OpenSSL.crypto.FILETYPE_ASN1, + base64.b64decode(self._cert_info['raw_cert']) + ) + pem_data = OpenSSL.crypto.dump_certificate( + OpenSSL.crypto.FILETYPE_PEM, + x509 + ).decode('utf-8') + + if filepath: + Path(filepath).write_text(pem_data, encoding='utf-8') + return None + return pem_data + except Exception as e: + return None + + def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]: + """ + Export certificate as DER. + + Args: + filepath (Optional[str]): Path to save the DER file (default: None). + + Returns: + Optional[bytes]: DER bytes if successful, None otherwise. 
+ """ + try: + der_data = base64.b64decode(self._cert_info['raw_cert']) + if filepath: + Path(filepath).write_bytes(der_data) + return None + return der_data + except Exception: + return None + + @property + def issuer(self) -> Dict[str, str]: + """Get certificate issuer information.""" + return self._cert_info.get('issuer', {}) + + @property + def subject(self) -> Dict[str, str]: + """Get certificate subject information.""" + return self._cert_info.get('subject', {}) + + @property + def valid_from(self) -> str: + """Get certificate validity start date.""" + return self._cert_info.get('not_before', '') + + @property + def valid_until(self) -> str: + """Get certificate validity end date.""" + return self._cert_info.get('not_after', '') + + @property + def fingerprint(self) -> str: + """Get certificate fingerprint.""" + return self._cert_info.get('fingerprint', '') diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py index a1f3a49e..6679bb1b 100644 --- a/crawl4ai/user_agent_generator.py +++ b/crawl4ai/user_agent_generator.py @@ -4,6 +4,34 @@ import re class UserAgentGenerator: + """ + Generate random user agents with specified constraints. + + Attributes: + desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings. + mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings. + browser_combinations (dict): A dictionary of possible browser combinations and their corresponding user agent strings. + rendering_engines (dict): A dictionary of possible rendering engines and their corresponding user agent strings. + chrome_versions (list): A list of possible Chrome browser versions. + firefox_versions (list): A list of possible Firefox browser versions. + edge_versions (list): A list of possible Edge browser versions. + safari_versions (list): A list of possible Safari browser versions. 
+ ios_versions (list): A list of possible iOS browser versions. + android_versions (list): A list of possible Android browser versions. + + Methods: + generate_user_agent( + platform: Literal["desktop", "mobile"] = "desktop", + browser: str = "chrome", + rendering_engine: str = "chrome_webkit", + chrome_version: Optional[str] = None, + firefox_version: Optional[str] = None, + edge_version: Optional[str] = None, + safari_version: Optional[str] = None, + ios_version: Optional[str] = None, + android_version: Optional[str] = None + ): Generates a random user agent string based on the specified parameters. + """ def __init__(self): # Previous platform definitions remain the same... self.desktop_platforms = { @@ -105,7 +133,21 @@ class UserAgentGenerator: ] def get_browser_stack(self, num_browsers: int = 1) -> List[str]: - """Get a valid combination of browser versions""" + """ + Get a valid combination of browser versions. + + How it works: + 1. Check if the number of browsers is supported. + 2. Randomly choose a combination of browsers. + 3. Iterate through the combination and add browser versions. + 4. Return the browser stack. + + Args: + num_browsers: Number of browser specifications (1-3) + + Returns: + List[str]: A list of browser versions. 
+ """ if num_browsers not in self.browser_combinations: raise ValueError(f"Unsupported number of browsers: {num_browsers}") diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 7ecc22da..214ebbc6 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1,4 +1,5 @@ import time +from urllib.parse import urlparse from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString import json @@ -6,7 +7,6 @@ import html import re import os import platform -from .html2text import HTML2Text from .prompts import PROMPT_EXTRACT_BLOCKS from .config import * from pathlib import Path @@ -14,7 +14,6 @@ from typing import Dict, Any from urllib.parse import urljoin import requests from requests.exceptions import InvalidSchema -import hashlib from typing import Optional, Tuple, Dict, Any import xxhash from colorama import Fore, Style, init @@ -26,64 +25,91 @@ from functools import wraps class InvalidCSSSelectorError(Exception): pass -def create_box_message( - message: str, - type: str = "info", - width: int = 120, - add_newlines: bool = True, - double_line: bool = False -) -> str: - init() - - # Define border and text colors for different types - styles = { - "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"), - "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), - "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"), - "error": (Fore.RED, Fore.LIGHTRED_EX, "×"), - } - - border_color, text_color, prefix = styles.get(type.lower(), styles["info"]) - - # Define box characters based on line style - box_chars = { - "single": ("─", "│", "┌", "┐", "└", "┘"), - "double": ("═", "║", "╔", "╗", "╚", "╝") - } - line_style = "double" if double_line else "single" - h_line, v_line, tl, tr, bl, br = box_chars[line_style] - - # Process lines with lighter text color - formatted_lines = [] - raw_lines = message.split('\n') - - if raw_lines: - first_line = f"{prefix} {raw_lines[0].strip()}" - wrapped_first = textwrap.fill(first_line, 
width=width-4) - formatted_lines.extend(wrapped_first.split('\n')) - - for line in raw_lines[1:]: - if line.strip(): - wrapped = textwrap.fill(f" {line.strip()}", width=width-4) - formatted_lines.extend(wrapped.split('\n')) - else: - formatted_lines.append("") - - # Create the box with colored borders and lighter text - horizontal_line = h_line * (width - 1) - box = [ - f"{border_color}{tl}{horizontal_line}{tr}", - *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines], - f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}" - ] - - result = "\n".join(box) - if add_newlines: - result = f"\n{result}\n" - - return result +def create_box_message(message: str, type: str = "info", width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str: + """ + Create a styled message box with colored borders and formatted text. + + How it works: + 1. Determines box style and colors based on the message type (e.g., info, warning). + 2. Wraps text to fit within the specified width. + 3. Constructs a box using characters (single or double lines) with appropriate formatting. + 4. Adds optional newlines before and after the box. + + Args: + message (str): The message to display inside the box. + type (str): Type of the message (e.g., "info", "warning", "error", "success"). Defaults to "info". + width (int): Width of the box. Defaults to 120. + add_newlines (bool): Whether to add newlines before and after the box. Defaults to True. + double_line (bool): Whether to use double lines for the box border. Defaults to False. + + Returns: + str: A formatted string containing the styled message box. 
+ """ + + init() + + # Define border and text colors for different types + styles = { + "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"), + "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), + "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"), + "error": (Fore.RED, Fore.LIGHTRED_EX, "×"), + } + + border_color, text_color, prefix = styles.get(type.lower(), styles["info"]) + + # Define box characters based on line style + box_chars = { + "single": ("─", "│", "┌", "┐", "└", "┘"), + "double": ("═", "║", "╔", "╗", "╚", "╝") + } + line_style = "double" if double_line else "single" + h_line, v_line, tl, tr, bl, br = box_chars[line_style] + + # Process lines with lighter text color + formatted_lines = [] + raw_lines = message.split('\n') + + if raw_lines: + first_line = f"{prefix} {raw_lines[0].strip()}" + wrapped_first = textwrap.fill(first_line, width=width-4) + formatted_lines.extend(wrapped_first.split('\n')) + + for line in raw_lines[1:]: + if line.strip(): + wrapped = textwrap.fill(f" {line.strip()}", width=width-4) + formatted_lines.extend(wrapped.split('\n')) + else: + formatted_lines.append("") + + # Create the box with colored borders and lighter text + horizontal_line = h_line * (width - 1) + box = [ + f"{border_color}{tl}{horizontal_line}{tr}", + *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines], + f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}" + ] + + result = "\n".join(box) + if add_newlines: + result = f"\n{result}\n" + + return result def calculate_semaphore_count(): + """ + Calculate the optimal semaphore count based on system resources. + + How it works: + 1. Determines the number of CPU cores and total system memory. + 2. Sets a base count as half of the available CPU cores. + 3. Limits the count based on memory, assuming 2GB per semaphore instance. + 4. Returns the minimum value between CPU and memory-based limits. + + Returns: + int: The calculated semaphore count. 
+ """ + cpu_count = os.cpu_count() memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB base_count = max(1, cpu_count // 2) @@ -91,6 +117,21 @@ def calculate_semaphore_count(): return min(base_count, memory_based_cap) def get_system_memory(): + """ + Get the total system memory in bytes. + + How it works: + 1. Detects the operating system. + 2. Reads memory information from system-specific commands or files. + 3. Converts the memory to bytes for uniformity. + + Returns: + int: The total system memory in bytes. + + Raises: + OSError: If the operating system is unsupported. + """ + system = platform.system() if system == "Linux": with open('/proc/meminfo', 'r') as mem: @@ -125,6 +166,18 @@ def get_system_memory(): raise OSError("Unsupported operating system") def get_home_folder(): + """ + Get or create the home folder for Crawl4AI configuration and cache. + + How it works: + 1. Uses environment variables or defaults to the user's home directory. + 2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist. + 3. Returns the path to the home folder. + + Returns: + str: The path to the Crawl4AI home folder. + """ + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) @@ -195,6 +248,20 @@ def split_and_parse_json_objects(json_string): return parsed_objects, unparsed_segments def sanitize_html(html): + """ + Sanitize an HTML string by escaping quotes. + + How it works: + 1. Replaces all unwanted and special characters with an empty string. + 2. Escapes double and single quotes for safe usage. + + Args: + html (str): The HTML string to sanitize. + + Returns: + str: The sanitized HTML string. 
+ """ + # Replace all unwanted and special characters with an empty string sanitized_html = html # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html) @@ -249,6 +316,23 @@ def escape_json_string(s): return s def replace_inline_tags(soup, tags, only_text=False): + """ + Replace inline HTML tags with Markdown-style equivalents. + + How it works: + 1. Maps specific tags (e.g., 'b', 'i') to Markdown syntax. + 2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object. + 3. Optionally replaces tags with their text content only. + + Args: + soup (BeautifulSoup): Parsed HTML content. + tags (List[str]): List of tags to replace. + only_text (bool): Whether to replace tags with plain text. Defaults to False. + + Returns: + BeautifulSoup: Updated BeautifulSoup object with replaced tags. + """ + tag_replacements = { 'b': lambda tag: f"**{tag.text}**", 'i': lambda tag: f"*{tag.text}*", @@ -293,6 +377,26 @@ def replace_inline_tags(soup, tags, only_text=False): # return soup def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs): + """ + Extract structured content, media, and links from website HTML. + + How it works: + 1. Parses the HTML content using BeautifulSoup. + 2. Extracts internal/external links and media (images, videos, audios). + 3. Cleans the content by removing unwanted tags and attributes. + 4. Converts cleaned HTML to Markdown. + 5. Collects metadata and returns the extracted information. + + Args: + url (str): The website URL. + html (str): The HTML content of the website. + word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD. + css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None. + + Returns: + Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata. 
+ """ + try: if not html: return None @@ -763,6 +867,27 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: } def extract_metadata(html, soup=None): + """ + Extract metadata from a page's HTML into a dictionary. + + How it works: + 1. Starts with an empty metadata dictionary. + 2. Checks for the case where neither `html` nor `soup` is provided. + 3. Gathers metadata fields from the document (this step lies outside the visible hunk; TODO confirm which fields are collected). + 4. Returns the metadata dictionary. + + Args: + html (str): The HTML content of the page. + soup (Optional[BeautifulSoup]): An already-parsed document, presumably used in place of `html` when given (verify). Defaults to None. + + Returns: + dict: The extracted metadata. + """ + metadata = {} if not html and not soup: @@ -810,10 +935,35 @@ def extract_metadata(html, soup=None): return metadata def extract_xml_tags(string): + """ + Extracts XML tags from a string. + + Args: + string (str): The input string containing XML tags. + + Returns: + List[str]: A list of XML tags extracted from the input string. + """ tags = re.findall(r'<(\w+)>', string) return list(set(tags)) def extract_xml_data(tags, string): + """ + Extract data for specified XML tags from a string. + + How it works: + 1. Searches the string for each tag using regex. + 2. Extracts the content within the tags. + 3. Returns a dictionary of tag-content pairs. + + Args: + tags (List[str]): The list of XML tags to extract. + string (str): The input string containing XML data. + + Returns: + Dict[str, str]: A dictionary with tag names as keys and extracted content as values. 
+ """ + data = {} for tag in tags: @@ -834,6 +984,26 @@ def perform_completion_with_backoff( base_url=None, **kwargs ): + """ + Perform an API completion request with exponential backoff. + + How it works: + 1. Sends a completion request to the API. + 2. Retries on rate-limit errors with exponential delays. + 3. Returns the API response or an error after all retries. + + Args: + provider (str): The name of the API provider. + prompt_with_variables (str): The input prompt for the completion request. + api_token (str): The API token for authentication. + json_response (bool): Whether to request a JSON response. Defaults to False. + base_url (Optional[str]): The base URL for the API. Defaults to None. + **kwargs: Additional arguments for the API request. + + Returns: + dict: The API response or an error message after all retries. + """ + from litellm import completion from litellm.exceptions import RateLimitError max_attempts = 3 @@ -879,6 +1049,25 @@ def perform_completion_with_backoff( }] def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None): + """ + Extract content blocks from website HTML using an AI provider. + + How it works: + 1. Prepares a prompt by sanitizing and escaping HTML. + 2. Sends the prompt to an AI provider with optional retries. + 3. Parses the response to extract structured blocks or errors. + + Args: + url (str): The website URL. + html (str): The HTML content of the website. + provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER. + api_token (Optional[str]): The API token for authentication. Defaults to None. + base_url (Optional[str]): The base URL for the API. Defaults to None. + + Returns: + List[dict]: A list of extracted content blocks. 
+ """ + # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token @@ -915,6 +1104,23 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, bas return blocks def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None): + """ + Extract content blocks from a batch of website HTMLs. + + How it works: + 1. Prepares prompts for each URL and HTML pair. + 2. Sends the prompts to the AI provider in a batch request. + 3. Parses the responses to extract structured blocks or errors. + + Args: + batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs. + provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192". + api_token (Optional[str]): The API token for authentication. Defaults to None. + + Returns: + List[dict]: A list of extracted content blocks from all batch items. + """ + api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token from litellm import batch_completion messages = [] @@ -987,6 +1193,25 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold): return merged_sections def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list: + """ + Process sections of HTML content sequentially or in parallel. + + How it works: + 1. Sequentially processes sections with delays for "groq/" providers. + 2. Uses ThreadPoolExecutor for parallel processing with other providers. + 3. Extracts content blocks for each section. + + Args: + url (str): The website URL. + sections (List[str]): The list of HTML sections to process. + provider (str): The AI provider for content extraction. + api_token (str): The API token for authentication. + base_url (Optional[str]): The base URL for the API. Defaults to None. + + Returns: + List[dict]: The list of extracted content blocks from all sections. 
+ """ + extracted_content = [] if provider.startswith("groq/"): # Sequential processing with a delay @@ -1003,6 +1228,24 @@ def process_sections(url: str, sections: list, provider: str, api_token: str, ba return extracted_content def wrap_text(draw, text, font, max_width): + """ + Wrap text to fit within a specified width for rendering. + + How it works: + 1. Splits the text into words. + 2. Constructs lines that fit within the maximum width using the provided font. + 3. Returns the wrapped text as a single string. + + Args: + draw (ImageDraw.Draw): The drawing context for measuring text size. + text (str): The text to wrap. + font (ImageFont.FreeTypeFont): The font to use for measuring text size. + max_width (int): The maximum width for each line. + + Returns: + str: The wrapped text. + """ + # Wrap the text to fit within the specified width lines = [] words = text.split() @@ -1014,6 +1257,21 @@ def wrap_text(draw, text, font, max_width): return '\n'.join(lines) def format_html(html_string): + """ + Prettify an HTML string using BeautifulSoup. + + How it works: + 1. Parses the HTML string with BeautifulSoup. + 2. Formats the HTML with proper indentation. + 3. Returns the prettified HTML string. + + Args: + html_string (str): The HTML string to format. + + Returns: + str: The prettified HTML string. + """ + soup = BeautifulSoup(html_string, 'lxml.parser') return soup.prettify() @@ -1110,23 +1368,94 @@ def normalize_url_tmp(href, base_url): return href.strip() -def is_external_url(url, base_domain): - """Determine if a URL is external""" - special_protocols = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'} - if any(url.lower().startswith(proto) for proto in special_protocols): +def get_base_domain(url: str) -> str: + """ + Extract the base domain from a given URL, handling common edge cases. + + How it works: + 1. Parses the URL to extract the domain. + 2. Removes the port number and 'www' prefix. + 3. 
Handles special domains (e.g., 'co.uk') to extract the correct base. + + Args: + url (str): The URL to extract the base domain from. + + Returns: + str: The extracted base domain or an empty string if parsing fails. + """ + try: + # Get domain from URL + domain = urlparse(url).netloc.lower() + if not domain: + return "" + + # Remove port if present + domain = domain.split(':')[0] + + # Remove www + domain = re.sub(r'^www\.', '', domain) + + # Extract last two parts of domain (handles co.uk etc) + parts = domain.split('.') + if len(parts) > 2 and parts[-2] in { + 'co', 'com', 'org', 'gov', 'edu', 'net', + 'mil', 'int', 'ac', 'ad', 'ae', 'af', 'ag' + }: + return '.'.join(parts[-3:]) + + return '.'.join(parts[-2:]) + except Exception: + return "" + +def is_external_url(url: str, base_domain: str) -> bool: + """ + Determine whether a URL is external to the given base domain. + + How it works: + 1. Treats special-protocol URLs (e.g., 'mailto:', 'javascript:') as external. + 2. Treats relative URLs (no network location) as internal. + 3. Strips 'www.' from both domains and checks whether the URL's domain ends with the base domain. + + Args: + url (str): The URL to check. + base_domain (str): The base domain to compare against. + Returns: + bool: True if the URL is external; False if it is internal or cannot be parsed. + """ + special = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'} + if any(url.lower().startswith(p) for p in special): return True try: - # Handle URLs with protocol - if url.startswith(('http://', 'https://')): - url_domain = url.split('/')[2] - return base_domain.lower() not in url_domain.lower() - except IndexError: - return False + parsed = urlparse(url) + if not parsed.netloc: # Relative URL + return False + + # Strip 'www.' 
from both domains for comparison + url_domain = parsed.netloc.lower().replace('www.', '') + base = base_domain.lower().replace('www.', '') - return False + # Check if URL domain ends with base domain + return not url_domain.endswith(base) + except Exception: + return False def clean_tokens(tokens: list[str]) -> list[str]: + """ + Clean a list of tokens by removing noise, stop words, and short tokens. + + How it works: + 1. Defines a set of noise words and stop words. + 2. Filters tokens based on length and exclusion criteria. + 3. Excludes tokens starting with certain symbols (e.g., "↑", "▲"). + + Args: + tokens (list[str]): The list of tokens to clean. + + Returns: + list[str]: The cleaned list of tokens. + """ + # Set of tokens to remove noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'} @@ -1182,6 +1511,21 @@ def clean_tokens(tokens: list[str]) -> list[str]: and not token.startswith('⬆')] def profile_and_time(func): + """ + Decorator to profile a function's execution time and performance. + + How it works: + 1. Records the start time before executing the function. + 2. Profiles the function's execution using `cProfile`. + 3. Prints the elapsed time and profiling statistics. + + Args: + func (Callable): The function to decorate. + + Returns: + Callable: The decorated function with profiling and timing enabled. 
+ """ + @wraps(func) def wrapper(self, *args, **kwargs): # Start timer @@ -1289,4 +1633,7 @@ def get_error_context(exc_info, context_lines: int = 5): "line_no": line_no, "function": func_name, "code_context": code_context - } \ No newline at end of file + } + + + \ No newline at end of file diff --git a/docs/examples/amazon_product_extraction_direct_url.py b/docs/examples/amazon_product_extraction_direct_url.py new file mode 100644 index 00000000..769c479e --- /dev/null +++ b/docs/examples/amazon_product_extraction_direct_url.py @@ -0,0 +1,114 @@ +""" +This example demonstrates how to use JSON CSS extraction to scrape product information +from Amazon search results. It shows how to extract structured data like product titles, +prices, ratings, and other details using CSS selectors. +""" + +from crawl4ai import AsyncWebCrawler +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import json + +async def extract_amazon_products(): + # Initialize browser config + browser_config = BrowserConfig( + browser_type="chromium", + headless=True + ) + + # Initialize crawler config with JSON CSS extraction strategy + crawler_config = CrawlerRunConfig( + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "Amazon Product Search Results", + "baseSelector": "[data-component-type='s-search-result']", + "fields": [ + { + "name": "asin", + "selector": "", + "type": "attribute", + "attribute": "data-asin" + }, + { + "name": "title", + "selector": "h2 a span", + "type": "text" + }, + { + "name": "url", + "selector": "h2 a", + "type": "attribute", + "attribute": "href" + }, + { + "name": "image", + "selector": ".s-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "rating", + "selector": ".a-icon-star-small .a-icon-alt", + "type": "text" + }, + { + "name": "reviews_count", + "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span", + "type": "text" + }, + { 
+ "name": "price", + "selector": ".a-price .a-offscreen", + "type": "text" + }, + { + "name": "original_price", + "selector": ".a-price.a-text-price .a-offscreen", + "type": "text" + }, + { + "name": "sponsored", + "selector": ".puis-sponsored-label-text", + "type": "exists" + }, + { + "name": "delivery_info", + "selector": "[data-cy='delivery-recipe'] .a-color-base", + "type": "text", + "multiple": True + } + ] + } + ) + ) + + # Example search URL (you should replace with your actual Amazon URL) + url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab" + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + # Extract the data + result = await crawler.arun(url=url, config=crawler_config) + + # Process and print the results + if result and result.extracted_content: + # Parse the JSON string into a list of products + products = json.loads(result.extracted_content) + + # Process each product in the list + for product in products: + print("\nProduct Details:") + print(f"ASIN: {product.get('asin')}") + print(f"Title: {product.get('title')}") + print(f"Price: {product.get('price')}") + print(f"Original Price: {product.get('original_price')}") + print(f"Rating: {product.get('rating')}") + print(f"Reviews: {product.get('reviews_count')}") + print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}") + if product.get('delivery_info'): + print(f"Delivery: {' '.join(product['delivery_info'])}") + print("-" * 80) + +if __name__ == "__main__": + import asyncio + asyncio.run(extract_amazon_products()) diff --git a/docs/examples/amazon_product_extraction_using_hooks.py b/docs/examples/amazon_product_extraction_using_hooks.py new file mode 100644 index 00000000..a17d60c5 --- /dev/null +++ b/docs/examples/amazon_product_extraction_using_hooks.py @@ -0,0 +1,145 @@ +""" +This example demonstrates how to use JSON CSS extraction to scrape product information +from Amazon search results. 
It shows how to extract structured data like product titles, +prices, ratings, and other details using CSS selectors. +""" + +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import json +from playwright.async_api import Page, BrowserContext + +async def extract_amazon_products(): + # Initialize browser config + browser_config = BrowserConfig( + # browser_type="chromium", + headless=True + ) + + # Initialize crawler config with JSON CSS extraction strategy nav-search-submit-button + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "Amazon Product Search Results", + "baseSelector": "[data-component-type='s-search-result']", + "fields": [ + { + "name": "asin", + "selector": "", + "type": "attribute", + "attribute": "data-asin" + }, + { + "name": "title", + "selector": "h2 a span", + "type": "text" + }, + { + "name": "url", + "selector": "h2 a", + "type": "attribute", + "attribute": "href" + }, + { + "name": "image", + "selector": ".s-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "rating", + "selector": ".a-icon-star-small .a-icon-alt", + "type": "text" + }, + { + "name": "reviews_count", + "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span", + "type": "text" + }, + { + "name": "price", + "selector": ".a-price .a-offscreen", + "type": "text" + }, + { + "name": "original_price", + "selector": ".a-price.a-text-price .a-offscreen", + "type": "text" + }, + { + "name": "sponsored", + "selector": ".puis-sponsored-label-text", + "type": "exists" + }, + { + "name": "delivery_info", + "selector": "[data-cy='delivery-recipe'] .a-color-base", + "type": "text", + "multiple": True + } + ] + } + ) + ) + + url = "https://www.amazon.com/" + + async def after_goto(page: Page, context: BrowserContext, url: str, response: 
dict, **kwargs): + """Hook called after navigating to each URL""" + print(f"[HOOK] after_goto - Successfully loaded: {url}") + + try: + # Wait for search box to be available + search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000) + + # Type the search query + await search_box.fill('Samsung Galaxy Tab') + + # Get the search button and prepare for navigation + search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000) + + # Click with navigation waiting + await search_button.click() + + # Wait for search results to load + await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000) + print("[HOOK] Search completed and results loaded!") + + except Exception as e: + print(f"[HOOK] Error during search operation: {str(e)}") + + return page + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + + crawler.crawler_strategy.set_hook("after_goto", after_goto) + + # Extract the data + result = await crawler.arun(url=url, config=crawler_config) + + # Process and print the results + if result and result.extracted_content: + # Parse the JSON string into a list of products + products = json.loads(result.extracted_content) + + # Process each product in the list + for product in products: + print("\nProduct Details:") + print(f"ASIN: {product.get('asin')}") + print(f"Title: {product.get('title')}") + print(f"Price: {product.get('price')}") + print(f"Original Price: {product.get('original_price')}") + print(f"Rating: {product.get('rating')}") + print(f"Reviews: {product.get('reviews_count')}") + print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}") + if product.get('delivery_info'): + print(f"Delivery: {' '.join(product['delivery_info'])}") + print("-" * 80) + +if __name__ == "__main__": + import asyncio + asyncio.run(extract_amazon_products()) diff --git a/docs/examples/amazon_product_extraction_using_use_javascript.py 
b/docs/examples/amazon_product_extraction_using_use_javascript.py new file mode 100644 index 00000000..15e5d6f5 --- /dev/null +++ b/docs/examples/amazon_product_extraction_using_use_javascript.py @@ -0,0 +1,129 @@ +""" +This example demonstrates how to use JSON CSS extraction to scrape product information +from Amazon search results. It shows how to extract structured data like product titles, +prices, ratings, and other details using CSS selectors. +""" + +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +import json +from playwright.async_api import Page, BrowserContext + +async def extract_amazon_products(): + # Initialize browser config + browser_config = BrowserConfig( + # browser_type="chromium", + headless=True + ) + + js_code_to_search = """ + const task = async () => { + document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab'; + document.querySelector('#nav-search-submit-button').click(); + } + await task(); + """ + js_code_to_search_sync = """ + document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab'; + document.querySelector('#nav-search-submit-button').click(); + """ + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + js_code = js_code_to_search, + wait_for='css:[data-component-type="s-search-result"]', + extraction_strategy=JsonCssExtractionStrategy( + schema={ + "name": "Amazon Product Search Results", + "baseSelector": "[data-component-type='s-search-result']", + "fields": [ + { + "name": "asin", + "selector": "", + "type": "attribute", + "attribute": "data-asin" + }, + { + "name": "title", + "selector": "h2 a span", + "type": "text" + }, + { + "name": "url", + "selector": "h2 a", + "type": "attribute", + "attribute": "href" + }, + { + "name": "image", + "selector": ".s-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "rating", + 
"selector": ".a-icon-star-small .a-icon-alt", + "type": "text" + }, + { + "name": "reviews_count", + "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span", + "type": "text" + }, + { + "name": "price", + "selector": ".a-price .a-offscreen", + "type": "text" + }, + { + "name": "original_price", + "selector": ".a-price.a-text-price .a-offscreen", + "type": "text" + }, + { + "name": "sponsored", + "selector": ".puis-sponsored-label-text", + "type": "exists" + }, + { + "name": "delivery_info", + "selector": "[data-cy='delivery-recipe'] .a-color-base", + "type": "text", + "multiple": True + } + ] + } + ) + ) + + # Example search URL (you should replace with your actual Amazon URL) + url = "https://www.amazon.com/" + + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + # Extract the data + result = await crawler.arun(url=url, config=crawler_config) + + # Process and print the results + if result and result.extracted_content: + # Parse the JSON string into a list of products + products = json.loads(result.extracted_content) + + # Process each product in the list + for product in products: + print("\nProduct Details:") + print(f"ASIN: {product.get('asin')}") + print(f"Title: {product.get('title')}") + print(f"Price: {product.get('price')}") + print(f"Original Price: {product.get('original_price')}") + print(f"Rating: {product.get('rating')}") + print(f"Reviews: {product.get('reviews_count')}") + print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}") + if product.get('delivery_info'): + print(f"Delivery: {' '.join(product['delivery_info'])}") + print("-" * 80) + +if __name__ == "__main__": + import asyncio + asyncio.run(extract_amazon_products()) diff --git a/docs/examples/browser_optimization_example.py b/docs/examples/browser_optimization_example.py new file mode 100644 index 00000000..f57dc147 --- /dev/null +++ b/docs/examples/browser_optimization_example.py @@ -0,0 +1,128 @@ 
+""" +This example demonstrates optimal browser usage patterns in Crawl4AI: +1. Sequential crawling with session reuse +2. Parallel crawling with browser instance reuse +3. Performance optimization settings +""" + +import asyncio +import os +from typing import List +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + + +async def crawl_sequential(urls: List[str]): + """ + Sequential crawling using session reuse - most efficient for moderate workloads + """ + print("\n=== Sequential Crawling with Session Reuse ===") + + # Configure browser with optimized settings + browser_config = BrowserConfig( + headless=True, + browser_args=[ + "--disable-gpu", # Disable GPU acceleration + "--disable-dev-shm-usage", # Disable /dev/shm usage + "--no-sandbox", # Required for Docker + ], + viewport={ + "width": 800, + "height": 600, + }, # Smaller viewport for better performance + ) + + # Configure crawl settings + crawl_config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + # content_filter=PruningContentFilter(), In case you need fit_markdown + ), + ) + + # Create single crawler instance + crawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + + try: + session_id = "session1" # Use same session for all URLs + for url in urls: + result = await crawler.arun( + url=url, + config=crawl_config, + session_id=session_id, # Reuse same browser tab + ) + if result.success: + print(f"Successfully crawled {url}") + print(f"Content length: {len(result.markdown_v2.raw_markdown)}") + finally: + await crawler.close() + + +async def crawl_parallel(urls: List[str], max_concurrent: int = 3): + """ + Parallel crawling while reusing browser instance - best for large workloads + """ + print("\n=== Parallel Crawling with Browser Reuse ===") + + browser_config = BrowserConfig( + headless=True, + 
browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"], + viewport={"width": 800, "height": 600}, + ) + + crawl_config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + # content_filter=PruningContentFilter(), In case you need fit_markdown + ), + ) + + # Create single crawler instance for all parallel tasks + crawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + + try: + # Create tasks in batches to control concurrency + for i in range(0, len(urls), max_concurrent): + batch = urls[i : i + max_concurrent] + tasks = [] + + for j, url in enumerate(batch): + session_id = ( + f"parallel_session_{j}" # Different session per concurrent task + ) + task = crawler.arun(url=url, config=crawl_config, session_id=session_id) + tasks.append(task) + + # Wait for batch to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + for url, result in zip(batch, results): + if isinstance(result, Exception): + print(f"Error crawling {url}: {str(result)}") + elif result.success: + print(f"Successfully crawled {url}") + print(f"Content length: {len(result.markdown_v2.raw_markdown)}") + finally: + await crawler.close() + + +async def main(): + # Example URLs + urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + "https://example.com/page4", + ] + + # Demo sequential crawling + await crawl_sequential(urls) + + # Demo parallel crawling + await crawl_parallel(urls, max_concurrent=2) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/extraction_strategies_example.py b/docs/examples/extraction_strategies_example.py new file mode 100644 index 00000000..348b891e --- /dev/null +++ b/docs/examples/extraction_strategies_example.py @@ -0,0 +1,115 @@ +""" +Example demonstrating different extraction strategies with various input formats. +This example shows how to: +1. 
Use different input formats (markdown, HTML, fit_markdown) +2. Work with JSON-based extractors (CSS and XPath) +3. Use LLM-based extraction with different input formats +4. Configure browser and crawler settings properly +""" + +import asyncio +import os +from typing import Dict, Any + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import ( + LLMExtractionStrategy, + JsonCssExtractionStrategy, + JsonXPathExtractionStrategy +) +from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str): + """Helper function to run extraction with proper configuration""" + try: + # Configure the crawler run settings + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=strategy, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() # For fit_markdown support + ) + ) + + # Run the crawler + result = await crawler.arun(url=url, config=config) + + if result.success: + print(f"\n=== {name} Results ===") + print(f"Extracted Content: {result.extracted_content}") + print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}") + print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}") + else: + print(f"Error in {name}: Crawl failed") + + except Exception as e: + print(f"Error in {name}: {str(e)}") + +async def main(): + # Example URL (replace with actual URL) + url = "https://example.com/product-page" + + # Configure browser settings + browser_config = BrowserConfig( + headless=True, + verbose=True + ) + + # Initialize extraction strategies + + # 1. 
LLM Extraction with different input formats + markdown_strategy = LLMExtractionStrategy( + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), + instruction="Extract product information including name, price, and description" + ) + + html_strategy = LLMExtractionStrategy( + input_format="html", + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), + instruction="Extract product information from HTML including structured data" + ) + + fit_markdown_strategy = LLMExtractionStrategy( + input_format="fit_markdown", + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), + instruction="Extract product information from cleaned markdown" + ) + + # 2. JSON CSS Extraction (automatically uses HTML input) + css_schema = { + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h1.product-title", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"}, + {"name": "description", "selector": ".description", "type": "text"} + ] + } + css_strategy = JsonCssExtractionStrategy(schema=css_schema) + + # 3. 
JSON XPath Extraction (automatically uses HTML input) + xpath_schema = { + "baseSelector": "//div[@class='product']", + "fields": [ + {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"}, + {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"}, + {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"} + ] + } + xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema) + + # Use context manager for proper resource handling + async with AsyncWebCrawler(config=browser_config) as crawler: + # Run all strategies + await run_extraction(crawler, url, markdown_strategy, "Markdown LLM") + await run_extraction(crawler, url, html_strategy, "HTML LLM") + await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM") + await run_extraction(crawler, url, css_strategy, "CSS Extraction") + await run_extraction(crawler, url, xpath_strategy, "XPath Extraction") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/full_page_screenshot_and_pdf_export.md b/docs/examples/full_page_screenshot_and_pdf_export.md index 1afc24ba..8522675c 100644 --- a/docs/examples/full_page_screenshot_and_pdf_export.md +++ b/docs/examples/full_page_screenshot_and_pdf_export.md @@ -39,8 +39,8 @@ async def main(): f.write(b64decode(result.screenshot)) # Save PDF - if result.pdf_data: - pdf_bytes = b64decode(result.pdf_data) + if result.pdf: + pdf_bytes = b64decode(result.pdf) with open(os.path.join(__location__, "page.pdf"), "wb") as f: f.write(pdf_bytes) diff --git a/docs/examples/hooks_example.py b/docs/examples/hooks_example.py new file mode 100644 index 00000000..09e0bc17 --- /dev/null +++ b/docs/examples/hooks_example.py @@ -0,0 +1,107 @@ +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from playwright.async_api import Page, BrowserContext + +async def main(): + print("🔗 Hooks Example: Demonstrating different hook use cases") + + 
# Configure browser settings + browser_config = BrowserConfig( + headless=True + ) + + # Configure crawler settings + crawler_run_config = CrawlerRunConfig( + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="body", + cache_mode=CacheMode.BYPASS + ) + + # Create crawler instance + crawler = AsyncWebCrawler(config=browser_config) + + # Define and set hook functions + async def on_browser_created(browser, context: BrowserContext, **kwargs): + """Hook called after the browser is created""" + print("[HOOK] on_browser_created - Browser is ready!") + # Example: Browser-level setup could go here (this demo does nothing extra) + return browser + + async def on_page_context_created(page: Page, context: BrowserContext, **kwargs): + """Hook called after a new page and context are created""" + print("[HOOK] on_page_context_created - New page created!") + # Example: Set a session cookie and the default viewport size + await context.add_cookies([{ + 'name': 'session_id', + 'value': 'example_session', + 'domain': '.example.com', + 'path': '/' + }]) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + + async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs): + """Hook called when the user agent is updated""" + print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}") + return page + + async def on_execution_started(page: Page, context: BrowserContext, **kwargs): + """Hook called after custom JavaScript execution""" + print("[HOOK] on_execution_started - Custom JS executed!") + return page + + async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs): + """Hook called before navigating to each URL""" + print(f"[HOOK] before_goto - About to visit: {url}") + # Example: Add custom headers for the request + await page.set_extra_http_headers({ + "Custom-Header": "my-value" + }) + return page + + async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs): + """Hook called
after navigating to each URL""" + print(f"[HOOK] after_goto - Successfully loaded: {url}") + # Example: Wait for a specific element to be loaded + try: + await page.wait_for_selector('.content', timeout=1000) + print("Content element found!") + except Exception: + print("Content element not found, continuing anyway") + return page + + async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs): + """Hook called before retrieving the HTML content""" + print("[HOOK] before_retrieve_html - About to get HTML content") + # Example: Scroll to bottom to trigger lazy loading + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + return page + + async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs): + """Hook called before returning the HTML content""" + print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})") + # Example: You could modify the HTML content here if needed + return page + + # Set all the hooks + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated) + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html) + crawler.crawler_strategy.set_hook("before_return_html", before_return_html) + + await crawler.start() + + # Example usage: crawl a simple website + url = 'https://example.com' + result = await crawler.arun(url, config=crawler_run_config) + print(f"\nCrawled URL: {result.url}") + print(f"HTML length: {len(result.html)}") + + await crawler.close() + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git
a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart_async.config.py index ff312688..4c4a9d86 100644 --- a/docs/examples/quickstart_async.config.py +++ b/docs/examples/quickstart_async.config.py @@ -1,6 +1,8 @@ import os, sys -sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) -os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692" + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) import asyncio import time @@ -12,7 +14,10 @@ from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai.extraction_strategy import ( + JsonCssExtractionStrategy, + LLMExtractionStrategy, +) __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) @@ -21,128 +26,182 @@ print("GitHub Repository: https://github.com/unclecode/crawl4ai") print("Twitter: @unclecode") print("Website: https://crawl4ai.com") + # Basic Example - Simple Crawl async def simple_crawl(): print("\n--- Basic Usage ---") browser_config = BrowserConfig(headless=True) - crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS - ) - + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=crawler_config + url="https://www.nbcnews.com/business", config=crawler_config ) print(result.markdown[:500]) + +async def clean_content(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + excluded_tags=["nav", "footer", "aside"], + remove_overlay_elements=True, + 
markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, threshold_type="fixed", min_word_threshold=0 + ), + options={"ignore_links": True}, + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + config=crawler_config, + ) + full_markdown_length = len(result.markdown_v2.raw_markdown) + fit_markdown_length = len(result.markdown_v2.fit_markdown) + print(f"Full Markdown Length: {full_markdown_length}") + print(f"Fit Markdown Length: {fit_markdown_length}") + +async def link_analysis(): + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + exclude_external_links=True, + exclude_social_media_links=True, + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config, + ) + print(f"Found {len(result.links['internal'])} internal links") + print(f"Found {len(result.links['external'])} external links") + + for link in result.links['internal'][:5]: + print(f"Href: {link['href']}\nText: {link['text']}\n") + # JavaScript Execution Example async def simple_example_with_running_js_code(): print("\n--- Executing JavaScript and Using CSS Selectors ---") - - browser_config = BrowserConfig( - headless=True, - java_script_enabled=True - ) - + + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, - js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"], + js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();", # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }" ) - + async with 
AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=crawler_config + url="https://www.nbcnews.com/business", config=crawler_config ) print(result.markdown[:500]) + # CSS Selector Example async def simple_example_with_css_selector(): print("\n--- Using CSS Selectors ---") browser_config = BrowserConfig(headless=True) crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - css_selector=".wide-tease-item__description" + cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description" ) - + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", config=crawler_config + ) + print(result.markdown[:500]) + +async def media_handling(): + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True) + async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", config=crawler_config ) - print(result.markdown[:500]) + for img in result.media['images'][:5]: + print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}") + +async def custom_hook_workflow(verbose=True): + async with AsyncWebCrawler() as crawler: + # Set a 'before_goto' hook to run custom code just before navigation; accept **kwargs because the crawler also passes url and other keyword arguments + crawler.crawler_strategy.set_hook("before_goto", lambda page, context, **kwargs: print("[Hook] Preparing to navigate...")) + + # Perform the crawl operation + result = await crawler.arun( + url="https://crawl4ai.com" + ) + print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- ")) + # Proxy Example async def use_proxy(): print("\n--- Using a Proxy ---") browser_config = BrowserConfig( headless=True, - proxy="http://your-proxy-url:port" + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "username", + "password": "password", + }, ) - crawler_config = CrawlerRunConfig( -
cache_mode=CacheMode.BYPASS - ) - + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=crawler_config + url="https://www.nbcnews.com/business", config=crawler_config ) if result.success: print(result.markdown[:500]) + # Screenshot Example async def capture_and_save_screenshot(url: str, output_path: str): browser_config = BrowserConfig(headless=True) - crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - screenshot=True - ) - + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True) + async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun( - url=url, - config=crawler_config - ) - + result = await crawler.arun(url=url, config=crawler_config) + if result.success and result.screenshot: import base64 + screenshot_data = base64.b64decode(result.screenshot) - with open(output_path, 'wb') as f: + with open(output_path, "wb") as f: f.write(screenshot_data) print(f"Screenshot saved successfully to {output_path}") else: print("Failed to capture screenshot") + # LLM Extraction Example class OpenAIModelFee(BaseModel): model_name: str = Field(..., description="Name of the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") - output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + output_fee: str = Field( + ..., description="Fee for output token for the OpenAI model." 
+ ) -async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None): + +async def extract_structured_data_using_llm( + provider: str, api_token: str = None, extra_headers: Dict[str, str] = None +): print(f"\n--- Extracting Structured Data with {provider} ---") - + if api_token is None and provider != "ollama": print(f"API token is required for {provider}. Skipping this example.") return browser_config = BrowserConfig(headless=True) - - extra_args = { - "temperature": 0, - "top_p": 0.9, - "max_tokens": 2000 - } + + extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000} if extra_headers: extra_args["extra_headers"] = extra_headers crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, word_count_threshold=1, - page_timeout = 80000, + page_timeout=80000, extraction_strategy=LLMExtractionStrategy( provider=provider, api_token=api_token, @@ -150,17 +209,17 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
Do not miss any models in the entire content.""", - extra_args=extra_args - ) + extra_args=extra_args, + ), ) - + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://openai.com/api/pricing/", - config=crawler_config + url="https://openai.com/api/pricing/", config=crawler_config ) print(result.extracted_content) + # CSS Extraction Example async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") @@ -192,16 +251,13 @@ async def extract_structured_data_using_css_extractor(): "name": "course_icon", "selector": ".image-92", "type": "attribute", - "attribute": "src" - } - ] + "attribute": "src", + }, + ], } - browser_config = BrowserConfig( - headless=True, - java_script_enabled=True - ) - + browser_config = BrowserConfig(headless=True, java_script_enabled=True) + js_click_tabs = """ (async () => { const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); @@ -212,23 +268,23 @@ async def extract_structured_data_using_css_extractor(): } })(); """ - + crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=JsonCssExtractionStrategy(schema), - js_code=[js_click_tabs] + js_code=[js_click_tabs], ) - + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://www.kidocode.com/degrees/technology", - config=crawler_config + url="https://www.kidocode.com/degrees/technology", config=crawler_config ) companies = json.loads(result.extracted_content) print(f"Successfully extracted {len(companies)} companies") print(json.dumps(companies[0], indent=2)) + # Dynamic Content Examples - Method 1 async def crawl_dynamic_content_pages_method_1(): print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") @@ -249,10 +305,7 @@ async def crawl_dynamic_content_pages_method_1(): except Exception as e: print(f"Warning: New content didn't appear after 
JavaScript execution: {e}") - browser_config = BrowserConfig( - headless=False, - java_script_enabled=True - ) + browser_config = BrowserConfig(headless=False, java_script_enabled=True) async with AsyncWebCrawler(config=browser_config) as crawler: crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) @@ -272,7 +325,7 @@ async def crawl_dynamic_content_pages_method_1(): css_selector="li.Box-sc-g0xbh4-0", js_code=js_next_page if page > 0 else None, js_only=page > 0, - session_id=session_id + session_id=session_id, ) result = await crawler.arun(url=url, config=crawler_config) @@ -286,14 +339,12 @@ async def crawl_dynamic_content_pages_method_1(): print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + # Dynamic Content Examples - Method 2 async def crawl_dynamic_content_pages_method_2(): print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---") - browser_config = BrowserConfig( - headless=False, - java_script_enabled=True - ) + browser_config = BrowserConfig(headless=False, java_script_enabled=True) js_next_page_and_wait = """ (async () => { @@ -343,7 +394,7 @@ async def crawl_dynamic_content_pages_method_2(): extraction_strategy=extraction_strategy, js_code=js_next_page_and_wait if page > 0 else None, js_only=page > 0, - session_id=session_id + session_id=session_id, ) result = await crawler.arun(url=url, config=crawler_config) @@ -355,88 +406,128 @@ async def crawl_dynamic_content_pages_method_2(): print(f"Successfully crawled {len(all_commits)} commits across 3 pages") + +async def cosine_similarity_extraction(): + crawl_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + extraction_strategy=CosineStrategy( + word_count_threshold=10, + max_dist=0.2, # Maximum distance between two words + linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single) + top_k=3, # Number of top keywords to extract + sim_threshold=0.3, # Similarity threshold for clustering + 
semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings + verbose=True + ), + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156", + config=crawl_config + ) + print(json.loads(result.extracted_content)[:5]) + # Browser Comparison async def crawl_custom_browser_type(): print("\n--- Browser Comparison ---") - + # Firefox - browser_config_firefox = BrowserConfig( - browser_type="firefox", - headless=True - ) + browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True) start = time.time() async with AsyncWebCrawler(config=browser_config_firefox) as crawler: result = await crawler.arun( url="https://www.example.com", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), ) print("Firefox:", time.time() - start) print(result.markdown[:500]) # WebKit - browser_config_webkit = BrowserConfig( - browser_type="webkit", - headless=True - ) + browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True) start = time.time() async with AsyncWebCrawler(config=browser_config_webkit) as crawler: result = await crawler.arun( url="https://www.example.com", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), ) print("WebKit:", time.time() - start) print(result.markdown[:500]) # Chromium (default) - browser_config_chromium = BrowserConfig( - browser_type="chromium", - headless=True - ) + browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True) start = time.time() async with AsyncWebCrawler(config=browser_config_chromium) as crawler: result = await crawler.arun( url="https://www.example.com", - config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + 
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), ) print("Chromium:", time.time() - start) print(result.markdown[:500]) + # Anti-Bot and User Simulation async def crawl_with_user_simulation(): browser_config = BrowserConfig( headless=True, user_agent_mode="random", - user_agent_generator_config={ - "device_type": "mobile", - "os_type": "android" - } + user_agent_generator_config={"device_type": "mobile", "os_type": "android"}, ) crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, magic=True, simulate_user=True, - override_navigator=True + override_navigator=True, ) async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun( - url="YOUR-URL-HERE", - config=crawler_config - ) + result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config) print(result.markdown) +async def ssl_certification(): + # Configure crawler to fetch SSL certificate + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url='https://example.com', + config=config + ) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + # 1. Access certificate properties directly + print("\nCertificate Information:") + print(f"Issuer: {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # 2. 
Export certificate in different formats + cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis + print("\nCertificate exported to:") + print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}") + + pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers + print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}") + + der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps + print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") + # Speed Comparison async def speed_comparison(): print("\n--- Speed Comparison ---") - + # Firecrawl comparison from firecrawl import FirecrawlApp - app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY']) + + app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"]) start = time.time() scrape_status = app.scrape_url( - 'https://www.nbcnews.com/business', - params={'formats': ['markdown', 'html']} + "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]} ) end = time.time() print("Firecrawl:") @@ -447,16 +538,15 @@ async def speed_comparison(): # Crawl4AI comparisons browser_config = BrowserConfig(headless=True) - + # Simple crawl async with AsyncWebCrawler(config=browser_config) as crawler: start = time.time() result = await crawler.arun( url="https://www.nbcnews.com/business", config=CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, - word_count_threshold=0 - ) + cache_mode=CacheMode.BYPASS, word_count_threshold=0 + ), ) end = time.time() print("Crawl4AI (simple crawl):") @@ -474,12 +564,10 @@ async def speed_comparison(): word_count_threshold=0, markdown_generator=DefaultMarkdownGenerator( content_filter=PruningContentFilter( - threshold=0.48, - threshold_type="fixed", - min_word_threshold=0 + threshold=0.48, threshold_type="fixed", min_word_threshold=0 ) - ) - ) + ), + ), ) end = time.time() print("Crawl4AI (Markdown Plus):") @@ -489,22 +577,25 @@ async def speed_comparison(): print(f"Images found: 
{result.markdown.count('cldnry.s-nbcnews.com')}") print() + # Main execution async def main(): # Basic examples # await simple_crawl() # await simple_example_with_running_js_code() # await simple_example_with_css_selector() - + # Advanced examples # await extract_structured_data_using_css_extractor() - await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm( + "openai/gpt-4o", os.getenv("OPENAI_API_KEY") + ) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() - + # Browser comparisons # await crawl_custom_browser_type() - + # Performance testing # await speed_comparison() @@ -514,5 +605,6 @@ async def main(): # os.path.join(__location__, "tmp/example_screenshot.jpg") # ) + if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index bd4c425f..e640e6bd 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -627,13 +627,13 @@ async def main(): # } # await extract_structured_data_using_llm(extra_headers=custom_headers) - await crawl_dynamic_content_pages_method_1() - await crawl_dynamic_content_pages_method_2() + # await crawl_dynamic_content_pages_method_1() + # await crawl_dynamic_content_pages_method_2() await crawl_dynamic_content_pages_method_3() - await crawl_custom_browser_type() + # await crawl_custom_browser_type() - await speed_comparison() + # await speed_comparison() if __name__ == "__main__": diff --git a/docs/examples/ssl_example.py b/docs/examples/ssl_example.py new file mode 100644 index 00000000..410e9485 --- /dev/null +++ b/docs/examples/ssl_example.py @@ -0,0 +1,46 @@ +"""Example showing how to work with SSL certificates in Crawl4AI.""" + +import asyncio +import os +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +# Create tmp directory if it doesn't exist 
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +tmp_dir = os.path.join(parent_dir, "tmp") +os.makedirs(tmp_dir, exist_ok=True) + +async def main(): + # Configure crawler to fetch SSL certificate + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS # Bypass cache to always get fresh certificates + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url='https://example.com', + config=config + ) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + # 1. Access certificate properties directly + print("\nCertificate Information:") + print(f"Issuer: {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # 2. Export certificate in different formats + cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis + print("\nCertificate exported to:") + print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}") + + pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) # For web servers + print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}") + + der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der")) # For Java apps + print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/tmp/chainlit_review.py b/docs/examples/tmp/chainlit_review.py deleted file mode 100644 index 2c03d17d..00000000 --- a/docs/examples/tmp/chainlit_review.py +++ /dev/null @@ -1,281 +0,0 @@ -from openai import AsyncOpenAI -from chainlit.types import ThreadDict -import chainlit as cl -from chainlit.input_widget import Select, Switch, Slider -client = AsyncOpenAI() - -# Instrument the OpenAI client -cl.instrument_openai() - -settings = { - "model": "gpt-3.5-turbo", - "temperature": 0.5, - "max_tokens": 500, - "top_p": 1, - "frequency_penalty": 0, - "presence_penalty": 0, -} - 
-@cl.action_callback("action_button") -async def on_action(action: cl.Action): - print("The user clicked on the action button!") - - return "Thank you for clicking on the action button!" - -@cl.set_chat_profiles -async def chat_profile(): - return [ - cl.ChatProfile( - name="GPT-3.5", - markdown_description="The underlying LLM model is **GPT-3.5**.", - icon="https://picsum.photos/200", - ), - cl.ChatProfile( - name="GPT-4", - markdown_description="The underlying LLM model is **GPT-4**.", - icon="https://picsum.photos/250", - ), - ] - -@cl.on_chat_start -async def on_chat_start(): - - settings = await cl.ChatSettings( - [ - Select( - id="Model", - label="OpenAI - Model", - values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"], - initial_index=0, - ), - Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True), - Slider( - id="Temperature", - label="OpenAI - Temperature", - initial=1, - min=0, - max=2, - step=0.1, - ), - Slider( - id="SAI_Steps", - label="Stability AI - Steps", - initial=30, - min=10, - max=150, - step=1, - description="Amount of inference steps performed on image generation.", - ), - Slider( - id="SAI_Cfg_Scale", - label="Stability AI - Cfg_Scale", - initial=7, - min=1, - max=35, - step=0.1, - description="Influences how strongly your generation is guided to match your prompt.", - ), - Slider( - id="SAI_Width", - label="Stability AI - Image Width", - initial=512, - min=256, - max=2048, - step=64, - tooltip="Measured in pixels", - ), - Slider( - id="SAI_Height", - label="Stability AI - Image Height", - initial=512, - min=256, - max=2048, - step=64, - tooltip="Measured in pixels", - ), - ] - ).send() - - chat_profile = cl.user_session.get("chat_profile") - await cl.Message( - content=f"starting chat using the {chat_profile} chat profile" - ).send() - - print("A new chat session has started!") - cl.user_session.set("session", { - "history": [], - "context": [] - }) - - image = 
cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline") - - # Attach the image to the message - await cl.Message( - content="You are such a good girl, aren't you?!", - elements=[image], - ).send() - - text_content = "Hello, this is a text element." - elements = [ - cl.Text(name="simple_text", content=text_content, display="inline") - ] - - await cl.Message( - content="Check out this text element!", - elements=elements, - ).send() - - elements = [ - cl.Audio(path="./assets/audio.mp3", display="inline"), - ] - await cl.Message( - content="Here is an audio file", - elements=elements, - ).send() - - await cl.Avatar( - name="Tool 1", - url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4", - ).send() - - await cl.Message( - content="This message should not have an avatar!", author="Tool 0" - ).send() - - await cl.Message( - content="This message should have an avatar!", author="Tool 1" - ).send() - - elements = [ - cl.File( - name="quickstart.py", - path="./quickstart.py", - display="inline", - ), - ] - - await cl.Message( - content="This message has a file element", elements=elements - ).send() - - # Sending an action button within a chatbot message - actions = [ - cl.Action(name="action_button", value="example_value", description="Click me!") - ] - - await cl.Message(content="Interact with this action button:", actions=actions).send() - - # res = await cl.AskActionMessage( - # content="Pick an action!", - # actions=[ - # cl.Action(name="continue", value="continue", label="✅ Continue"), - # cl.Action(name="cancel", value="cancel", label="❌ Cancel"), - # ], - # ).send() - - # if res and res.get("value") == "continue": - # await cl.Message( - # content="Continue!", - # ).send() - - # import plotly.graph_objects as go - # fig = go.Figure( - # data=[go.Bar(y=[2, 1, 3])], - # layout_title_text="An example figure", - # ) - # elements = [cl.Plotly(name="chart", figure=fig, 
display="inline")] - - # await cl.Message(content="This message has a chart", elements=elements).send() - - # Sending a pdf with the local file path - # elements = [ - # cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf") - # ] - - # cl.Message(content="Look at this local pdf!", elements=elements).send() - -@cl.on_settings_update -async def setup_agent(settings): - print("on_settings_update", settings) - -@cl.on_stop -def on_stop(): - print("The user wants to stop the task!") - -@cl.on_chat_end -def on_chat_end(): - print("The user disconnected!") - - -@cl.on_chat_resume -async def on_chat_resume(thread: ThreadDict): - print("The user resumed a previous chat session!") - - - - -# @cl.on_message -async def on_message(message: cl.Message): - cl.user_session.get("session")["history"].append({ - "role": "user", - "content": message.content - }) - response = await client.chat.completions.create( - messages=[ - { - "content": "You are a helpful bot", - "role": "system" - }, - *cl.user_session.get("session")["history"] - ], - **settings - ) - - - # Add assitanr message to the history - cl.user_session.get("session")["history"].append({ - "role": "assistant", - "content": response.choices[0].message.content - }) - - # msg.content = response.choices[0].message.content - # await msg.update() - - # await cl.Message(content=response.choices[0].message.content).send() - -@cl.on_message -async def on_message(message: cl.Message): - cl.user_session.get("session")["history"].append({ - "role": "user", - "content": message.content - }) - - msg = cl.Message(content="") - await msg.send() - - stream = await client.chat.completions.create( - messages=[ - { - "content": "You are a helpful bot", - "role": "system" - }, - *cl.user_session.get("session")["history"] - ], - stream = True, - **settings - ) - - async for part in stream: - if token := part.choices[0].delta.content or "": - await msg.stream_token(token) - - # Add assitanr message to the history - 
cl.user_session.get("session")["history"].append({ - "role": "assistant", - "content": msg.content - }) - await msg.update() - -if __name__ == "__main__": - from chainlit.cli import run_chainlit - run_chainlit(__file__) \ No newline at end of file diff --git a/docs/examples/tmp/research_assistant_audio_not_completed.py b/docs/examples/tmp/research_assistant_audio_not_completed.py deleted file mode 100644 index e0ad2b4f..00000000 --- a/docs/examples/tmp/research_assistant_audio_not_completed.py +++ /dev/null @@ -1,238 +0,0 @@ -# Make sure to install the required packageschainlit and groq -import os, time -from openai import AsyncOpenAI -import chainlit as cl -import re -import requests -from io import BytesIO -from chainlit.element import ElementBased -from groq import Groq - -# Import threadpools to run the crawl_url function in a separate thread -from concurrent.futures import ThreadPoolExecutor - -client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY")) - -# Instrument the OpenAI client -cl.instrument_openai() - -settings = { - "model": "llama3-8b-8192", - "temperature": 0.5, - "max_tokens": 500, - "top_p": 1, - "frequency_penalty": 0, - "presence_penalty": 0, -} - -def extract_urls(text): - url_pattern = re.compile(r'(https?://\S+)') - return url_pattern.findall(text) - -def crawl_url(url): - data = { - "urls": [url], - "include_raw_html": True, - "word_count_threshold": 10, - "extraction_strategy": "NoExtractionStrategy", - "chunking_strategy": "RegexChunking" - } - response = requests.post("https://crawl4ai.com/crawl", json=data) - response_data = response.json() - response_data = response_data['results'][0] - return response_data['markdown'] - -@cl.on_chat_start -async def on_chat_start(): - cl.user_session.set("session", { - "history": [], - "context": {} - }) - await cl.Message( - content="Welcome to the chat! How can I assist you today?" 
- ).send() - -@cl.on_message -async def on_message(message: cl.Message): - user_session = cl.user_session.get("session") - - # Extract URLs from the user's message - urls = extract_urls(message.content) - - - futures = [] - with ThreadPoolExecutor() as executor: - for url in urls: - futures.append(executor.submit(crawl_url, url)) - - results = [future.result() for future in futures] - - for url, result in zip(urls, results): - ref_number = f"REF_{len(user_session['context']) + 1}" - user_session["context"][ref_number] = { - "url": url, - "content": result - } - - # for url in urls: - # # Crawl the content of each URL and add it to the session context with a reference number - # ref_number = f"REF_{len(user_session['context']) + 1}" - # crawled_content = crawl_url(url) - # user_session["context"][ref_number] = { - # "url": url, - # "content": crawled_content - # } - - user_session["history"].append({ - "role": "user", - "content": message.content - }) - - # Create a system message that includes the context - context_messages = [ - f'\n{data["content"]}\n' - for ref, data in user_session["context"].items() - ] - if context_messages: - system_message = { - "role": "system", - "content": ( - "You are a helpful bot. Use the following context for answering questions. " - "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n" - "If the question requires any information from the provided appendices or context, refer to the sources. " - "If not, there is no need to add a references section. " - "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n" - "\n\n".join(context_messages) - ) - } - else: - system_message = { - "role": "system", - "content": "You are a helpful assistant." 
- } - - - msg = cl.Message(content="") - await msg.send() - - # Get response from the LLM - stream = await client.chat.completions.create( - messages=[ - system_message, - *user_session["history"] - ], - stream=True, - **settings - ) - - assistant_response = "" - async for part in stream: - if token := part.choices[0].delta.content: - assistant_response += token - await msg.stream_token(token) - - # Add assistant message to the history - user_session["history"].append({ - "role": "assistant", - "content": assistant_response - }) - await msg.update() - - # Append the reference section to the assistant's response - reference_section = "\n\nReferences:\n" - for ref, data in user_session["context"].items(): - reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n" - - msg.content += reference_section - await msg.update() - - -@cl.on_audio_chunk -async def on_audio_chunk(chunk: cl.AudioChunk): - if chunk.isStart: - buffer = BytesIO() - # This is required for whisper to recognize the file type - buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}" - # Initialize the session for a new audio stream - cl.user_session.set("audio_buffer", buffer) - cl.user_session.set("audio_mime_type", chunk.mimeType) - - # Write the chunks to a buffer and transcribe the whole audio at the end - cl.user_session.get("audio_buffer").write(chunk.data) - - pass - -@cl.step(type="tool") -async def speech_to_text(audio_file): - cli = Groq() - - # response = cli.audio.transcriptions.create( - # file=audio_file, #(filename, file.read()), - # model="whisper-large-v3", - # ) - - response = await client.audio.transcriptions.create( - model="whisper-large-v3", file=audio_file - ) - - return response.text - - -@cl.on_audio_end -async def on_audio_end(elements: list[ElementBased]): - # Get the audio buffer from the session - audio_buffer: BytesIO = cl.user_session.get("audio_buffer") - audio_buffer.seek(0) # Move the file pointer to the beginning - audio_file = audio_buffer.read() - 
audio_mime_type: str = cl.user_session.get("audio_mime_type") - - # input_audio_el = cl.Audio( - # mime=audio_mime_type, content=audio_file, name=audio_buffer.name - # ) - # await cl.Message( - # author="You", - # type="user_message", - # content="", - # elements=[input_audio_el, *elements] - # ).send() - - # answer_message = await cl.Message(content="").send() - - - start_time = time.time() - whisper_input = (audio_buffer.name, audio_file, audio_mime_type) - transcription = await speech_to_text(whisper_input) - end_time = time.time() - print(f"Transcription took {end_time - start_time} seconds") - - user_msg = cl.Message( - author="You", - type="user_message", - content=transcription - ) - await user_msg.send() - await on_message(user_msg) - - # images = [file for file in elements if "image" in file.mime] - - # text_answer = await generate_text_answer(transcription, images) - - # output_name, output_audio = await text_to_speech(text_answer, audio_mime_type) - - # output_audio_el = cl.Audio( - # name=output_name, - # auto_play=True, - # mime=audio_mime_type, - # content=output_audio, - # ) - - # answer_message.elements = [output_audio_el] - - # answer_message.content = transcription - # await answer_message.update() - -if __name__ == "__main__": - from chainlit.cli import run_chainlit - run_chainlit(__file__) - - diff --git a/docs/md_v2/advanced/content-processing.md b/docs/md_v2/advanced/content-processing.md index 71a32438..25ed6172 100644 --- a/docs/md_v2/advanced/content-processing.md +++ b/docs/md_v2/advanced/content-processing.md @@ -2,80 +2,12 @@ Crawl4AI provides powerful content processing capabilities that help you extract clean, relevant content from web pages. This guide covers content cleaning, media handling, link analysis, and metadata extraction. -## Content Cleaning - -### Understanding Clean Content -When crawling web pages, you often encounter a lot of noise - advertisements, navigation menus, footers, popups, and other irrelevant content. 
Crawl4AI automatically cleans this noise using several approaches: - -1. **Basic Cleaning**: Removes unwanted HTML elements and attributes -2. **Content Relevance**: Identifies and preserves meaningful content blocks -3. **Layout Analysis**: Understands page structure to identify main content areas - -```python -result = await crawler.arun( - url="https://example.com", - word_count_threshold=10, # Remove blocks with fewer words - excluded_tags=['form', 'nav'], # Remove specific HTML tags - remove_overlay_elements=True # Remove popups/modals -) - -# Get clean content -print(result.cleaned_html) # Cleaned HTML -print(result.markdown) # Clean markdown version -``` - -### Fit Markdown: Smart Content Extraction -One of Crawl4AI's most powerful features is `fit_markdown`. This feature uses advanced heuristics to identify and extract the main content from a webpage while excluding irrelevant elements. - -#### How Fit Markdown Works -- Analyzes content density and distribution -- Identifies content patterns and structures -- Removes boilerplate content (headers, footers, sidebars) -- Preserves the most relevant content blocks -- Maintains content hierarchy and formatting - -#### Perfect For: -- Blog posts and articles -- News content -- Documentation pages -- Any page with a clear main content area - -#### Not Recommended For: -- E-commerce product listings -- Search results pages -- Social media feeds -- Pages with multiple equal-weight content sections - -```python -result = await crawler.arun(url="https://example.com") - -# Get the most relevant content -main_content = result.fit_markdown - -# Compare with regular markdown -all_content = result.markdown - -print(f"Fit Markdown Length: {len(main_content)}") -print(f"Regular Markdown Length: {len(all_content)}") -``` - -#### Example Use Case -```python -async def extract_article_content(url: str) -> str: - """Extract main article content from a blog or news site.""" - async with AsyncWebCrawler() as crawler: - result = 
await crawler.arun(url=url) - - # fit_markdown will focus on the article content, - # excluding navigation, ads, and other distractions - return result.fit_markdown -``` - ## Media Processing Crawl4AI provides comprehensive media extraction and analysis capabilities. It automatically detects and processes various types of media elements while maintaining their context and relevance. ### Image Processing + The library handles various image scenarios, including: - Regular images - Lazy-loaded images @@ -84,7 +16,10 @@ The library handles various image scenarios, including: - Image metadata and context ```python -result = await crawler.arun(url="https://example.com") +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=config) for image in result.media["images"]: # Each image includes rich metadata @@ -96,20 +31,27 @@ for image in result.media["images"]: ``` ### Handling Lazy-Loaded Content -Crawl4aai already handles lazy loading for media elements. You can also customize the wait time for lazy-loaded content: + +Crawl4AI already handles lazy loading for media elements. 
You can customize the wait time for lazy-loaded content with `CrawlerRunConfig`: ```python -result = await crawler.arun( - url="https://example.com", +config = CrawlerRunConfig( wait_for="css:img[data-src]", # Wait for lazy images delay_before_return_html=2.0 # Additional wait time ) +result = await crawler.arun(url="https://example.com", config=config) ``` ### Video and Audio Content + The library extracts video and audio elements with their metadata: ```python +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=config) + # Process videos for video in result.media["videos"]: print(f"Video source: {video['src']}") @@ -129,6 +71,7 @@ for audio in result.media["audios"]: Crawl4AI provides sophisticated link analysis capabilities, helping you understand the relationship between pages and identify important navigation patterns. ### Link Classification + The library automatically categorizes links into: - Internal links (same domain) - External links (different domains) @@ -137,7 +80,10 @@ The library automatically categorizes links into: - Content links ```python -result = await crawler.arun(url="https://example.com") +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=config) # Analyze internal links for link in result.links["internal"]: @@ -154,18 +100,19 @@ for link in result.links["external"]: ``` ### Smart Link Filtering -Control which links are included in the results: + +Control which links are included in the results with `CrawlerRunConfig`: ```python -result = await crawler.arun( - url="https://example.com", +config = CrawlerRunConfig( exclude_external_links=True, # Remove external links exclude_social_media_links=True, # Remove social media links - exclude_social_media_domains=[ # Custom social media domains + exclude_social_media_domains=[ # Custom social media domains 
"facebook.com", "twitter.com", "instagram.com" ], exclude_domains=["ads.example.com"] # Exclude specific domains ) +result = await crawler.arun(url="https://example.com", config=config) ``` ## Metadata Extraction @@ -173,7 +120,10 @@ result = await crawler.arun( Crawl4AI automatically extracts and processes page metadata, providing valuable information about the content: ```python -result = await crawler.arun(url="https://example.com") +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=config) metadata = result.metadata print(f"Title: {metadata['title']}") @@ -184,40 +134,3 @@ print(f"Published Date: {metadata['published_date']}") print(f"Modified Date: {metadata['modified_date']}") print(f"Language: {metadata['language']}") ``` - -## Best Practices - -1. **Use Fit Markdown for Articles** - ```python - # Perfect for blog posts, news articles, documentation - content = result.fit_markdown - ``` - -2. **Handle Media Appropriately** - ```python - # Filter by relevance score - relevant_images = [ - img for img in result.media["images"] - if img['score'] > 5 - ] - ``` - -3. **Combine Link Analysis with Content** - ```python - # Get content links with context - content_links = [ - link for link in result.links["internal"] - if link['type'] == 'content' - ] - ``` - -4. 
**Clean Content with Purpose** - ```python - # Customize cleaning based on your needs - result = await crawler.arun( - url=url, - word_count_threshold=20, # Adjust based on content type - keep_data_attributes=False, # Remove data attributes - process_iframes=True # Include iframe content - ) - ``` \ No newline at end of file diff --git a/docs/md_v2/advanced/hooks-auth.md b/docs/md_v2/advanced/hooks-auth.md index 8da3a1cc..66042229 100644 --- a/docs/md_v2/advanced/hooks-auth.md +++ b/docs/md_v2/advanced/hooks-auth.md @@ -1,114 +1,121 @@ # Hooks & Auth for AsyncWebCrawler -Crawl4AI's AsyncWebCrawler allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions that are called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This example demonstrates how to use various hooks to customize the asynchronous crawling process. +Crawl4AI's `AsyncWebCrawler` allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This updated documentation demonstrates how to use hooks, including the new `on_page_context_created` hook, and ensures compatibility with `BrowserConfig` and `CrawlerRunConfig`. ## Example: Using Crawler Hooks with AsyncWebCrawler -Let's see how we can customize the AsyncWebCrawler using hooks! In this example, we'll: +In this example, we'll: -1. Configure the browser when it's created. -2. Add custom headers before navigating to the URL. -3. Log the current URL after navigation. -4. Perform actions after JavaScript execution. -5. Log the length of the HTML before returning it. +1. Configure the browser and set up authentication when it's created. +2. Apply custom routing and initial actions when the page context is created. +3. Add custom headers before navigating to the URL. +4. 
Log the current URL after navigation. +5. Perform actions after JavaScript execution. +6. Log the length of the HTML before returning it. ### Hook Definitions ```python import asyncio from crawl4ai import AsyncWebCrawler -from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig from playwright.async_api import Page, Browser, BrowserContext -async def on_browser_created(browser: Browser): +def log_routing(route): + # Example: block loading images + if route.request.resource_type == "image": + print(f"[HOOK] Blocking image request: {route.request.url}") + asyncio.create_task(route.abort()) + else: + asyncio.create_task(route.continue_()) + +async def on_browser_created(browser: Browser, **kwargs): print("[HOOK] on_browser_created") - # Example customization: set browser viewport size - context = await browser.new_context(viewport={'width': 1920, 'height': 1080}) + # Example: Set browser viewport size and log in + context = await browser.new_context(viewport={"width": 1920, "height": 1080}) page = await context.new_page() - - # Example customization: logging in to a hypothetical website - await page.goto('https://example.com/login') - await page.fill('input[name="username"]', 'testuser') - await page.fill('input[name="password"]', 'password123') - await page.click('button[type="submit"]') - await page.wait_for_selector('#welcome') - - # Add a custom cookie - await context.add_cookies([{'name': 'test_cookie', 'value': 'cookie_value', 'url': 'https://example.com'}]) - + await page.goto("https://example.com/login") + await page.fill("input[name='username']", "testuser") + await page.fill("input[name='password']", "password123") + await page.click("button[type='submit']") + await page.wait_for_selector("#welcome") + await context.add_cookies([{"name": "auth_token", "value": "abc123", "url": "https://example.com"}]) await page.close() await context.close() -async def before_goto(page: 
Page): - print("[HOOK] before_goto") - # Example customization: add custom headers - await page.set_extra_http_headers({'X-Test-Header': 'test'}) +async def on_page_context_created(context: BrowserContext, page: Page, **kwargs): + print("[HOOK] on_page_context_created") + await context.route("**", log_routing) -async def after_goto(page: Page): +async def before_goto(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] before_goto") + await page.set_extra_http_headers({"X-Test-Header": "test"}) + +async def after_goto(page: Page, context: BrowserContext, **kwargs): print("[HOOK] after_goto") - # Example customization: log the URL print(f"Current URL: {page.url}") -async def on_execution_started(page: Page): +async def on_execution_started(page: Page, context: BrowserContext, **kwargs): print("[HOOK] on_execution_started") - # Example customization: perform actions after JS execution await page.evaluate("console.log('Custom JS executed')") -async def before_return_html(page: Page, html: str): +async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs): print("[HOOK] before_return_html") - # Example customization: log the HTML length print(f"HTML length: {len(html)}") return page ``` -### Using the Hooks with the AsyncWebCrawler +### Using the Hooks with AsyncWebCrawler ```python -import asyncio -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy - async def main(): - print("\n🔗 Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!") - - initial_cookies = [ - {"name": "sessionId", "value": "abc123", "domain": ".example.com"}, - {"name": "userId", "value": "12345", "domain": ".example.com"} - ] - crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True, cookies=initial_cookies) - crawler_strategy.set_hook('on_browser_created', on_browser_created) - crawler_strategy.set_hook('before_goto', before_goto) - 
crawler_strategy.set_hook('after_goto', after_goto) - crawler_strategy.set_hook('on_execution_started', on_execution_started) - crawler_strategy.set_hook('before_return_html', before_return_html) - - async with AsyncWebCrawler(verbose=True, crawler_strategy=crawler_strategy) as crawler: - result = await crawler.arun( - url="https://example.com", - js_code="window.scrollTo(0, document.body.scrollHeight);", - wait_for="footer" - ) + print("\n🔗 Using Crawler Hooks: Customize AsyncWebCrawler with hooks!") - print("📦 Crawler Hooks result:") + # Configure browser and crawler settings + browser_config = BrowserConfig( + headless=True, + viewport_width=1920, + viewport_height=1080 + ) + + crawler_run_config = CrawlerRunConfig( + js_code="window.scrollTo(0, document.body.scrollHeight);", + wait_for="footer" + ) + + # Initialize crawler + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) + crawler.crawler_strategy.set_hook("before_return_html", before_return_html) + + # Run the crawler + result = await crawler.arun(url="https://example.com", config=crawler_run_config) + + print("\n📦 Crawler Hooks Result:") print(result) asyncio.run(main()) ``` -### Explanation +### Explanation of Hooks -- `on_browser_created`: This hook is called when the Playwright browser is created. It sets up the browser context, logs in to a website, and adds a custom cookie. -- `before_goto`: This hook is called right before Playwright navigates to the URL. It adds custom HTTP headers. -- `after_goto`: This hook is called after Playwright navigates to the URL. It logs the current URL. 
-- `on_execution_started`: This hook is called after any custom JavaScript is executed. It performs additional JavaScript actions. -- `before_return_html`: This hook is called before returning the HTML content. It logs the length of the HTML content. +- **`on_browser_created`**: Called when the browser is created. Use this to configure the browser or handle authentication (e.g., logging in and setting cookies). +- **`on_page_context_created`**: Called when a new page context is created. Use this to apply routing, block resources, or inject custom logic before navigating to the URL. +- **`before_goto`**: Called before navigating to the URL. Use this to add custom headers or perform other pre-navigation actions. +- **`after_goto`**: Called after navigation. Use this to verify content or log the URL. +- **`on_execution_started`**: Called after executing custom JavaScript. Use this to perform additional actions. +- **`before_return_html`**: Called before returning the HTML content. Use this to log details or preprocess the content. -### Additional Ideas +### Additional Customizations -- **Handling authentication**: Use the `on_browser_created` hook to handle login processes or set authentication tokens. -- **Dynamic header modification**: Modify headers based on the target URL or other conditions in the `before_goto` hook. -- **Content verification**: Use the `after_goto` hook to verify that the expected content is present on the page. -- **Custom JavaScript injection**: Inject and execute custom JavaScript using the `on_execution_started` hook. -- **Content preprocessing**: Modify or analyze the HTML content in the `before_return_html` hook before it's returned. +- **Resource Management**: Use `on_page_context_created` to block or modify requests (e.g., block images, fonts, or third-party scripts). +- **Dynamic Headers**: Use `before_goto` to add or modify headers dynamically based on the URL. 
+- **Authentication**: Use `on_browser_created` to handle login processes and set authentication cookies or tokens. +- **Content Analysis**: Use `before_return_html` to analyze or modify the extracted HTML content. + +These hooks provide powerful customization options for tailoring the crawling process to your needs. -By using these hooks, you can customize the behavior of the AsyncWebCrawler to suit your specific needs, including handling authentication, modifying requests, and preprocessing content. \ No newline at end of file diff --git a/docs/md_v2/advanced/identity_based_crawling.md b/docs/md_v2/advanced/identity_based_crawling.md new file mode 100644 index 00000000..c0ab7fd5 --- /dev/null +++ b/docs/md_v2/advanced/identity_based_crawling.md @@ -0,0 +1,156 @@ +### Preserve Your Identity with Crawl4AI + +Crawl4AI empowers you to navigate and interact with the web using your authentic digital identity, ensuring that you are recognized as a human and not mistaken for a bot. This document introduces Managed Browsers, the recommended approach for preserving your rights to access the web, and Magic Mode, a simplified solution for specific scenarios. + +--- + +### Managed Browsers: Your Digital Identity Solution + +**Managed Browsers** enable developers to create and use persistent browser profiles. These profiles store local storage, cookies, and other session-related data, allowing you to interact with websites as a recognized user. By leveraging your unique identity, Managed Browsers ensure that your experience reflects your rights as a human browsing the web. + +#### Why Use Managed Browsers? +1. **Authentic Browsing Experience**: Managed Browsers retain session data and browser fingerprints, mirroring genuine user behavior. +2. **Effortless Configuration**: Once you interact with the site using the browser (e.g., solving a CAPTCHA), the session data is saved and reused, providing seamless access. +3. 
**Empowered Data Access**: By using your identity, Managed Browsers empower users to access data they can view on their own screens without artificial restrictions. + +#### Steps to Use Managed Browsers + +1. **Set Up the Browser Configuration**: + ```python + from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + + browser_config = BrowserConfig( + headless=False, # Set to False for initial setup to view browser actions + verbose=True, + user_agent_mode="random", + use_managed_browser=True, # Enables persistent browser sessions + browser_type="chromium", + user_data_dir="/path/to/user_profile_data" # Path to save session data + ) + ``` + +2. **Perform an Initial Run**: + - Run the crawler with `headless=False`. + - Manually interact with the site (e.g., solve a CAPTCHA or log in). + - The browser session saves cookies, local storage, and other required data. + +3. **Subsequent Runs**: + - Switch to `headless=True` for automation. + - The session data is reused, allowing seamless crawling. 
+ +#### Example: Extracting Data Using Managed Browsers + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +async def main(): + # Define schema for structured data extraction + schema = { + "name": "Example Data", + "baseSelector": "div.example", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} + ] + } + + # Configure crawler + browser_config = BrowserConfig( + headless=True, # Automate subsequent runs + verbose=True, + use_managed_browser=True, + user_data_dir="/path/to/user_profile_data" + ) + + crawl_config = CrawlerRunConfig( + extraction_strategy=JsonCssExtractionStrategy(schema), + wait_for="css:div.example" # Wait for the targeted element to load + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=crawl_config + ) + + if result.success: + print("Extracted Data:", result.extracted_content) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Benefits of Managed Browsers Over Other Methods +Managed Browsers eliminate the need for manual detection workarounds by enabling developers to work directly with their identity and user profile data. This approach ensures maximum compatibility with websites and simplifies the crawling process while preserving your right to access data freely. + +--- + +### Magic Mode: Simplified Automation + +While Managed Browsers are the preferred approach, **Magic Mode** provides an alternative for scenarios where persistent user profiles are unnecessary or infeasible. Magic Mode automates user-like behavior and simplifies configuration. + +#### What Magic Mode Does: +- Simulates human browsing by randomizing interaction patterns and timing. +- Masks browser automation signals. +- Handles cookie popups and modals. 
+- Modifies navigator properties for enhanced compatibility. + +#### Using Magic Mode + +```python +async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + magic=True # Enables all automation features + ) +``` + +Magic Mode is particularly useful for: +- Quick prototyping when a Managed Browser setup is not available. +- Basic sites requiring minimal interaction or configuration. + +#### Example: Combining Magic Mode with Additional Options + +```python +async def crawl_with_magic_mode(url: str): + async with AsyncWebCrawler(headless=True) as crawler: + result = await crawler.arun( + url=url, + magic=True, + remove_overlay_elements=True, # Remove popups/modals + page_timeout=60000 # Increased timeout for complex pages + ) + + return result.markdown if result.success else None +``` + +### Magic Mode vs. Managed Browsers +While Magic Mode simplifies many tasks, it cannot match the reliability and authenticity of Managed Browsers. By using your identity and persistent profiles, Managed Browsers render Magic Mode largely unnecessary. However, Magic Mode remains a viable fallback for specific situations where user identity is not a factor. + +--- + +### Key Comparison: Managed Browsers vs. Magic Mode + +| Feature | **Managed Browsers** | **Magic Mode** | +|-------------------------|------------------------------------------|-------------------------------------| +| **Session Persistence** | Retains cookies and local storage. | No session retention. | +| **Human Interaction** | Uses real user profiles and data. | Simulates human-like patterns. | +| **Complex Sites** | Best suited for heavily configured sites.| Works well with simpler challenges.| +| **Setup Complexity** | Requires initial manual interaction. | Fully automated, one-line setup. | + +#### Recommendation: +- Use **Managed Browsers** for reliable, session-based crawling and data extraction. 
+- Use **Magic Mode** for quick prototyping or when persistent profiles are not required. + +--- + +### Conclusion + +- **Use Managed Browsers** to preserve your digital identity and ensure reliable, identity-based crawling with persistent sessions. This approach works seamlessly for even the most complex websites. +- **Leverage Magic Mode** for quick automation or in scenarios where persistent user profiles are not needed. + +By combining these approaches, Crawl4AI provides unparalleled flexibility and capability for your crawling needs. + diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md index 0d327f2e..bbe07f2f 100644 --- a/docs/md_v2/advanced/managed_browser.md +++ b/docs/md_v2/advanced/managed_browser.md @@ -1,136 +1,188 @@ -# Content Filtering in Crawl4AI +# Creating Browser Instances, Contexts, and Pages -This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies. +## 1 Introduction -## Relevance Content Filter +### Overview of Browser Management in Crawl4AI +Crawl4AI's browser management system is designed to provide developers with advanced tools for handling complex web crawling tasks. By managing browser instances, contexts, and pages, Crawl4AI ensures optimal performance, anti-bot measures, and session persistence for high-volume, dynamic web crawling. -The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. 
+### Key Objectives +- **Anti-Bot Handling**: + - Implements stealth techniques to evade detection mechanisms used by modern websites. + - Simulates human-like behavior, such as mouse movements, scrolling, and key presses. + - Supports integration with third-party services to bypass CAPTCHA challenges. +- **Persistent Sessions**: + - Retains session data (cookies, local storage) for workflows requiring user authentication. + - Allows seamless continuation of tasks across multiple runs without re-authentication. +- **Scalable Crawling**: + - Optimized resource utilization for handling thousands of URLs concurrently. + - Flexible configuration options to tailor crawling behavior to specific requirements. +--- -## Pruning Content Filter +## 2 Browser Creation Methods -The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold. +### Standard Browser Creation +Standard browser creation initializes a browser instance with default or minimal configurations. It is suitable for tasks that do not require session persistence or heavy customization. -### Usage +#### Features and Limitations +- **Features**: + - Quick and straightforward setup for small-scale tasks. + - Supports headless and headful modes. +- **Limitations**: + - Lacks advanced customization options like session reuse. + - May struggle with sites employing strict anti-bot measures. 
+#### Example Usage ```python -from crawl4ai import AsyncWebCrawler -from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai import AsyncWebCrawler, BrowserConfig -async def filter_content(url): - async with AsyncWebCrawler() as crawler: - content_filter = PruningContentFilter( - min_word_threshold=5, - threshold_type='dynamic', - threshold=0.45 - ) - result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) - if result.success: - print(f"Cleaned Markdown:\n{result.fit_markdown}") +browser_config = BrowserConfig(browser_type="chromium", headless=True) +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://crawl4ai.com") + print(result.markdown) ``` -### Parameters +### Persistent Contexts +Persistent contexts create browser sessions with stored data, enabling workflows that require maintaining login states or other session-specific information. -- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned. 
- -- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated: - - `'fixed'`: Uses a constant threshold value for all nodes - - `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios - -- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning: - - For fixed threshold: Nodes scoring below this value are removed - - For dynamic threshold: This value is adjusted based on node properties - -### How It Works - -The pruning algorithm evaluates each node using multiple metrics: -- Text density: Ratio of actual text to overall node content -- Link density: Proportion of text within links -- Tag importance: Weight based on HTML tag type (e.g., article, p, div) -- Content quality: Metrics like text length and structural importance - -Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks. - -The algorithm is particularly effective for: -- Removing boilerplate content -- Eliminating navigation menus and sidebars -- Preserving main article content -- Maintaining document structure while removing noise - - -## BM25 Algorithm - -The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query. - -### Usage - -To use the `BM25ContentFilter`, initialize it and then pass it as the `extraction_strategy` parameter to the `arun` method of the crawler. +#### Benefits of Using `user_data_dir` +- **Session Persistence**: + - Stores cookies, local storage, and cache between crawling sessions. + - Reduces overhead for repetitive logins or multi-step workflows. 
+- **Enhanced Performance**: + - Leverages pre-loaded resources for faster page loading. +- **Flexibility**: + - Adapts to complex workflows requiring user-specific configurations. +#### Example: Setting Up Persistent Contexts ```python -from crawl4ai import AsyncWebCrawler -from crawl4ai.content_filter_strategy import BM25ContentFilter - -async def filter_content(url, query=None): - async with AsyncWebCrawler() as crawler: - content_filter = BM25ContentFilter(user_query=query) - result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering - if result.success: - print(f"Filtered Content (JSON):\n{result.extracted_content}") - print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object - print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing. - else: - print("Error:", result.error_message) - -# Example usage: -asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query -asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query. - +config = BrowserConfig(user_data_dir="/path/to/user/data") +async with AsyncWebCrawler(config=config) as crawler: + result = await crawler.arun("https://crawl4ai.com") + print(result.markdown) ``` -### Parameters +### Managed Browser +The `ManagedBrowser` class offers a high-level abstraction for managing browser instances, emphasizing resource management, debugging capabilities, and anti-bot measures. -- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query. -- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. 
Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering. +#### How It Works +- **Browser Process Management**: + - Automates initialization and cleanup of browser processes. + - Optimizes resource usage by pooling and reusing browser instances. +- **Debugging Support**: + - Integrates with debugging tools like Chrome Developer Tools for real-time inspection. +- **Anti-Bot Measures**: + - Implements stealth plugins to mimic real user behavior and bypass bot detection. +#### Features +- **Customizable Configurations**: + - Supports advanced options such as viewport resizing, proxy settings, and header manipulation. +- **Debugging and Logging**: + - Logs detailed browser interactions for debugging and performance analysis. +- **Scalability**: + - Handles multiple browser instances concurrently, scaling dynamically based on workload. -## Fit Markdown Flag - -Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`. - - -## Custom Content Filtering Strategies - -You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs. 
- +#### Example: Using `ManagedBrowser` ```python -from crawl4ai.content_filter_strategy import RelevantContentFilter -from bs4 import BeautifulSoup, Tag -from typing import List - -class MyCustomFilter(RelevantContentFilter): - def filter_content(self, html: str) -> List[str]: - soup = BeautifulSoup(html, 'lxml') - # Implement custom filtering logic here - # Example: extract all paragraphs within divs with class "article-body" - filtered_paragraphs = [] - for tag in soup.select("div.article-body p"): - if isinstance(tag, Tag): - filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element. - return filtered_paragraphs - - - -async def custom_filter_demo(url: str): - async with AsyncWebCrawler() as crawler: - custom_filter = MyCustomFilter() - result = await crawler.arun(url, extraction_strategy=custom_filter) - if result.success: - print(result.extracted_content) +from crawl4ai import AsyncWebCrawler, BrowserConfig +config = BrowserConfig(headless=False, debug_port=9222) +async with AsyncWebCrawler(config=config) as crawler: + result = await crawler.arun("https://crawl4ai.com") + print(result.markdown) ``` -This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques. +--- -## Conclusion +## 3 Context and Page Management -Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline. +### Creating and Configuring Browser Contexts +Browser contexts act as isolated environments within a single browser instance, enabling independent browsing sessions with their own cookies, cache, and storage. 
+ +#### Customizations +- **Headers and Cookies**: + - Define custom headers to mimic specific devices or browsers. + - Set cookies for authenticated sessions. +- **Session Reuse**: + - Retain and reuse session data across multiple requests. + - Example: Preserve login states for authenticated crawls. + +#### Example: Context Initialization +```python +from crawl4ai import CrawlerRunConfig + +config = CrawlerRunConfig(headers={"User-Agent": "Crawl4AI/1.0"}) +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://crawl4ai.com", config=config) + print(result.markdown) +``` + +### Creating Pages +Pages represent individual tabs or views within a browser context. They are responsible for rendering content, executing JavaScript, and handling user interactions. + +#### Key Features +- **IFrame Handling**: + - Extract content from embedded iframes. + - Navigate and interact with nested content. +- **Viewport Customization**: + - Adjust viewport size to match target device dimensions. +- **Lazy Loading**: + - Ensure dynamic elements are fully loaded before extraction. + +#### Example: Page Initialization +```python +config = CrawlerRunConfig(viewport_width=1920, viewport_height=1080) +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://crawl4ai.com", config=config) + print(result.markdown) +``` + +--- + +## 4 Advanced Features and Best Practices + +### Debugging and Logging +Remote debugging provides a powerful way to troubleshoot complex crawling workflows. + +#### Example: Enabling Remote Debugging +```python +config = BrowserConfig(debug_port=9222) +async with AsyncWebCrawler(config=config) as crawler: + result = await crawler.arun("https://crawl4ai.com") +``` + +### Anti-Bot Techniques +- **Human Behavior Simulation**: + - Mimic real user actions, such as scrolling, clicking, and typing. + - Example: Use JavaScript to simulate interactions. 
+- **Captcha Handling**: + - Integrate with third-party services like 2Captcha or AntiCaptcha for automated solving. + +#### Example: Simulating User Actions +```python +js_code = """ +(async () => { + document.querySelector('input[name="search"]').value = 'test'; + document.querySelector('button[type="submit"]').click(); +})(); +""" +config = CrawlerRunConfig(js_code=[js_code]) +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://crawl4ai.com", config=config) +``` + +### Optimizations for Performance and Scalability +- **Persistent Contexts**: + - Reuse browser contexts to minimize resource consumption. +- **Concurrent Crawls**: + - Use `arun_many` with a controlled semaphore count for efficient batch processing. + +#### Example: Scaling Crawls +```python +urls = ["https://example1.com", "https://example2.com"] +config = CrawlerRunConfig(semaphore_count=10) +async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many(urls, config=config) + for result in results: + print(result.url, result.markdown) +``` diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index c7602531..8989777b 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -4,59 +4,67 @@ Configure proxy settings and enhance security features in Crawl4AI for reliable ## Basic Proxy Setup -Simple proxy configuration: +Simple proxy configuration with `BrowserConfig`: ```python +from crawl4ai.async_configs import BrowserConfig + # Using proxy URL -async with AsyncWebCrawler( - proxy="http://proxy.example.com:8080" -) as crawler: +browser_config = BrowserConfig(proxy="http://proxy.example.com:8080") +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") # Using SOCKS proxy -async with AsyncWebCrawler( - proxy="socks5://proxy.example.com:1080" -) as crawler: +browser_config = 
BrowserConfig(proxy="socks5://proxy.example.com:1080") +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") ``` ## Authenticated Proxy -Use proxy with authentication: +Use an authenticated proxy with `BrowserConfig`: ```python +from crawl4ai.async_configs import BrowserConfig + proxy_config = { "server": "http://proxy.example.com:8080", "username": "user", "password": "pass" } -async with AsyncWebCrawler(proxy_config=proxy_config) as crawler: +browser_config = BrowserConfig(proxy_config=proxy_config) +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") ``` ## Rotating Proxies -Example using a proxy rotation service: +Example using a proxy rotation service and updating `BrowserConfig` dynamically: ```python +from crawl4ai.async_configs import BrowserConfig + async def get_next_proxy(): # Your proxy rotation logic here return {"server": "http://next.proxy.com:8080"} -async with AsyncWebCrawler() as crawler: +browser_config = BrowserConfig() +async with AsyncWebCrawler(config=browser_config) as crawler: # Update proxy for each request for url in urls: proxy = await get_next_proxy() - crawler.update_proxy(proxy) - result = await crawler.arun(url=url) + browser_config.proxy_config = proxy + result = await crawler.arun(url=url, config=browser_config) ``` ## Custom Headers -Add security-related headers: +Add security-related headers via `BrowserConfig`: ```python +from crawl4ai.async_configs import BrowserConfig + headers = { "X-Forwarded-For": "203.0.113.195", "Accept-Language": "en-US,en;q=0.9", @@ -64,21 +72,24 @@ headers = { "Pragma": "no-cache" } -async with AsyncWebCrawler(headers=headers) as crawler: +browser_config = BrowserConfig(headers=headers) +async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") ``` ## Combining with Magic Mode -For maximum protection, combine proxy 
with Magic Mode: +For maximum protection, combine proxy with Magic Mode via `CrawlerRunConfig` and `BrowserConfig`: ```python -async with AsyncWebCrawler( +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +browser_config = BrowserConfig( proxy="http://proxy.example.com:8080", headers={"Accept-Language": "en-US"} -) as crawler: - result = await crawler.arun( - url="https://example.com", - magic=True # Enable all anti-detection features - ) -``` \ No newline at end of file +) +crawler_config = CrawlerRunConfig(magic=True) # Enable all anti-detection features + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=crawler_config) +``` diff --git a/docs/md_v2/advanced/session-management-advanced.md b/docs/md_v2/advanced/session-management-advanced.md index 908828f7..ba1ae0a0 100644 --- a/docs/md_v2/advanced/session-management-advanced.md +++ b/docs/md_v2/advanced/session-management-advanced.md @@ -1,44 +1,53 @@ -# Session-Based Crawling for Dynamic Content +### Session-Based Crawling for Dynamic Content -In modern web applications, content is often loaded dynamically without changing the URL. Examples include "Load More" buttons, infinite scrolling, or paginated content that updates via JavaScript. To effectively crawl such websites, Crawl4AI provides powerful session-based crawling capabilities. +In modern web applications, content is often loaded dynamically without changing the URL. Examples include "Load More" buttons, infinite scrolling, or paginated content that updates via JavaScript. Crawl4AI provides session-based crawling capabilities to handle such scenarios effectively. -This guide will explore advanced techniques for crawling dynamic content using Crawl4AI's session management features. +This guide explores advanced techniques for crawling dynamic content using Crawl4AI's session management features. 
+ +--- ## Understanding Session-Based Crawling -Session-based crawling allows you to maintain a persistent browser session across multiple requests. This is crucial when: +Session-based crawling allows you to reuse a persistent browser session across multiple actions. This means the same browser tab (or page object) is used throughout, enabling: -1. The content changes dynamically without URL changes -2. You need to interact with the page (e.g., clicking buttons) between requests -3. The site requires authentication or maintains state across pages +1. **Efficient handling of dynamic content** without reloading the page. +2. **JavaScript actions before and after crawling** (e.g., clicking buttons or scrolling). +3. **State maintenance** for authenticated sessions or multi-step workflows. +4. **Faster sequential crawling**, as it avoids reopening tabs or reallocating resources. -Crawl4AI's `AsyncWebCrawler` class supports session-based crawling through the `session_id` parameter and related methods. +**Note:** Session-based crawling is ideal for sequential operations, not parallel tasks. + +--- ## Basic Concepts -Before diving into examples, let's review some key concepts: +Before diving into examples, here are some key concepts: -- **Session ID**: A unique identifier for a browsing session. Use the same `session_id` across multiple `arun` calls to maintain state. -- **JavaScript Execution**: Use the `js_code` parameter to execute JavaScript on the page, such as clicking a "Load More" button. -- **CSS Selectors**: Use these to target specific elements for extraction or interaction. -- **Extraction Strategy**: Define how to extract structured data from the page. -- **Wait Conditions**: Specify conditions to wait for before considering the page loaded. +- **Session ID**: A unique identifier for a browsing session. Use the same `session_id` across multiple requests to maintain state. 
+- **BrowserConfig & CrawlerRunConfig**: These configuration objects control browser settings and crawling behavior. +- **JavaScript Execution**: Use `js_code` to perform actions like clicking buttons. +- **CSS Selectors**: Target specific elements for interaction or data extraction. +- **Extraction Strategy**: Define rules to extract structured data. +- **Wait Conditions**: Specify conditions to wait for before proceeding. + +--- ## Example 1: Basic Session-Based Crawling -Let's start with a basic example of session-based crawling: +A simple example using session-based crawling: ```python import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig +from crawl4ai.cache_context import CacheMode async def basic_session_crawl(): - async with AsyncWebCrawler(verbose=True) as crawler: - session_id = "my_session" + async with AsyncWebCrawler() as crawler: + session_id = "dynamic_content_session" url = "https://example.com/dynamic-content" for page in range(3): - result = await crawler.arun( + config = CrawlerRunConfig( url=url, session_id=session_id, js_code="document.querySelector('.load-more-button').click();" if page > 0 else None, @@ -46,6 +55,7 @@ async def basic_session_crawl(): cache_mode=CacheMode.BYPASS ) + result = await crawler.arun(config=config) print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items") await crawler.crawler_strategy.kill_session(session_id) @@ -53,17 +63,16 @@ async def basic_session_crawl(): asyncio.run(basic_session_crawl()) ``` -This example demonstrates: -1. Using a consistent `session_id` across multiple `arun` calls -2. Executing JavaScript to load more content after the first page -3. Using a CSS selector to extract specific content -4. Properly closing the session after crawling +This example shows: +1. Reusing the same `session_id` across multiple requests. +2. Executing JavaScript to load more content dynamically. +3. 
Properly closing the session to free resources. + +--- ## Advanced Technique 1: Custom Execution Hooks -Crawl4AI allows you to set custom hooks that execute at different stages of the crawling process. This is particularly useful for handling complex loading scenarios. - -Here's an example that waits for new content to appear before proceeding: +Use custom hooks to handle complex scenarios, such as waiting for content to load dynamically: ```python async def advanced_session_crawl_with_hooks(): @@ -75,202 +84,96 @@ async def advanced_session_crawl_with_hooks(): while True: await page.wait_for_selector("li.commit-item h4") commit = await page.query_selector("li.commit-item h4") - commit = await commit.evaluate("(element) => element.textContent") - commit = commit.strip() + commit = (await commit.evaluate("(element) => element.textContent")).strip() if commit and commit != first_commit: first_commit = commit break await asyncio.sleep(0.5) except Exception as e: - print(f"Warning: New content didn't appear after JavaScript execution: {e}") + print(f"Warning: New content didn't appear: {e}") - async with AsyncWebCrawler(verbose=True) as crawler: + async with AsyncWebCrawler() as crawler: + session_id = "commit_session" + url = "https://github.com/example/repo/commits/main" crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) - url = "https://github.com/example/repo/commits/main" - session_id = "commit_session" - all_commits = [] - - js_next_page = """ - const button = document.querySelector('a.pagination-next'); - if (button) button.click(); - """ + js_next_page = """document.querySelector('a.pagination-next').click();""" for page in range(3): - result = await crawler.arun( + config = CrawlerRunConfig( url=url, session_id=session_id, - css_selector="li.commit-item", js_code=js_next_page if page > 0 else None, - cache_mode=CacheMode.BYPASS, - js_only=page > 0 + css_selector="li.commit-item", + js_only=page > 0, + cache_mode=CacheMode.BYPASS ) - 
commits = result.extracted_content.select("li.commit-item") - all_commits.extend(commits) - print(f"Page {page + 1}: Found {len(commits)} commits") + result = await crawler.arun(config=config) + print(f"Page {page + 1}: Found {len(result.extracted_content)} commits") await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 pages") asyncio.run(advanced_session_crawl_with_hooks()) ``` -This technique uses a custom `on_execution_started` hook to ensure new content has loaded before proceeding to the next step. +This technique ensures new content loads before the next action. + +--- ## Advanced Technique 2: Integrated JavaScript Execution and Waiting -Instead of using separate hooks, you can integrate the waiting logic directly into your JavaScript execution. This approach can be more concise and easier to manage for some scenarios. - -Here's an example: +Combine JavaScript execution and waiting logic for concise handling of dynamic content: ```python async def integrated_js_and_wait_crawl(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://github.com/example/repo/commits/main" + async with AsyncWebCrawler() as crawler: session_id = "integrated_session" - all_commits = [] + url = "https://github.com/example/repo/commits/main" js_next_page_and_wait = """ (async () => { - const getCurrentCommit = () => { - const commits = document.querySelectorAll('li.commit-item h4'); - return commits.length > 0 ? 
commits[0].textContent.trim() : null; - }; - + const getCurrentCommit = () => document.querySelector('li.commit-item h4').textContent.trim(); const initialCommit = getCurrentCommit(); - const button = document.querySelector('a.pagination-next'); - if (button) button.click(); - - while (true) { + document.querySelector('a.pagination-next').click(); + while (getCurrentCommit() === initialCommit) { await new Promise(resolve => setTimeout(resolve, 100)); - const newCommit = getCurrentCommit(); - if (newCommit && newCommit !== initialCommit) { - break; - } } })(); """ - schema = { - "name": "Commit Extractor", - "baseSelector": "li.commit-item", - "fields": [ - { - "name": "title", - "selector": "h4.commit-title", - "type": "text", - "transform": "strip", - }, - ], - } - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - for page in range(3): - result = await crawler.arun( + config = CrawlerRunConfig( url=url, session_id=session_id, - css_selector="li.commit-item", - extraction_strategy=extraction_strategy, js_code=js_next_page_and_wait if page > 0 else None, + css_selector="li.commit-item", js_only=page > 0, cache_mode=CacheMode.BYPASS ) - commits = json.loads(result.extracted_content) - all_commits.extend(commits) - print(f"Page {page + 1}: Found {len(commits)} commits") + result = await crawler.arun(config=config) + print(f"Page {page + 1}: Found {len(result.extracted_content)} commits") await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 pages") asyncio.run(integrated_js_and_wait_crawl()) ``` -This approach combines the JavaScript for clicking the "next" button and waiting for new content to load into a single script. - -## Advanced Technique 3: Using the `wait_for` Parameter - -Crawl4AI provides a `wait_for` parameter that allows you to specify a condition to wait for before considering the page fully loaded. This can be particularly useful for dynamic content. 
- -Here's an example: - -```python -async def wait_for_parameter_crawl(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://github.com/example/repo/commits/main" - session_id = "wait_for_session" - all_commits = [] - - js_next_page = """ - const commits = document.querySelectorAll('li.commit-item h4'); - if (commits.length > 0) { - window.lastCommit = commits[0].textContent.trim(); - } - const button = document.querySelector('a.pagination-next'); - if (button) button.click(); - """ - - wait_for = """() => { - const commits = document.querySelectorAll('li.commit-item h4'); - if (commits.length === 0) return false; - const firstCommit = commits[0].textContent.trim(); - return firstCommit !== window.lastCommit; - }""" - - schema = { - "name": "Commit Extractor", - "baseSelector": "li.commit-item", - "fields": [ - { - "name": "title", - "selector": "h4.commit-title", - "type": "text", - "transform": "strip", - }, - ], - } - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - for page in range(3): - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.commit-item", - extraction_strategy=extraction_strategy, - js_code=js_next_page if page > 0 else None, - wait_for=wait_for if page > 0 else None, - js_only=page > 0, - cache_mode=CacheMode.BYPASS - ) - - commits = json.loads(result.extracted_content) - all_commits.extend(commits) - print(f"Page {page + 1}: Found {len(commits)} commits") - - await crawler.crawler_strategy.kill_session(session_id) - print(f"Successfully crawled {len(all_commits)} commits across 3 pages") - -asyncio.run(wait_for_parameter_crawl()) -``` - -This technique separates the JavaScript execution (clicking the "next" button) from the waiting condition, providing more flexibility and clarity in some scenarios. +--- ## Best Practices for Session-Based Crawling -1. **Use Unique Session IDs**: Ensure each crawling session has a unique `session_id` to prevent conflicts. -2. 
**Close Sessions**: Always close sessions using `kill_session` when you're done to free up resources. -3. **Handle Errors**: Implement proper error handling to deal with unexpected situations during crawling. -4. **Respect Website Terms**: Ensure your crawling adheres to the website's terms of service and robots.txt file. -5. **Implement Delays**: Add appropriate delays between requests to avoid overwhelming the target server. -6. **Use Extraction Strategies**: Leverage `JsonCssExtractionStrategy` or other extraction strategies for structured data extraction. -7. **Optimize JavaScript**: Keep your JavaScript execution concise and efficient to improve crawling speed. -8. **Monitor Performance**: Keep an eye on memory usage and crawling speed, especially for long-running sessions. +1. **Unique Session IDs**: Assign descriptive and unique `session_id` values. +2. **Close Sessions**: Always clean up sessions with `kill_session` after use. +3. **Error Handling**: Anticipate and handle errors gracefully. +4. **Respect Websites**: Follow terms of service and robots.txt. +5. **Delays**: Add delays to avoid overwhelming servers. +6. **Optimize JavaScript**: Keep scripts concise for better performance. +7. **Monitor Resources**: Track memory and CPU usage for long sessions. + +--- ## Conclusion -Session-based crawling with Crawl4AI provides powerful capabilities for handling dynamic content and complex web applications. By leveraging session management, JavaScript execution, and waiting strategies, you can effectively crawl and extract data from a wide range of modern websites. - -Remember to use these techniques responsibly and in compliance with website policies and ethical web scraping practices. - -For more advanced usage and API details, refer to the Crawl4AI API documentation. \ No newline at end of file +Session-based crawling in Crawl4AI is a robust solution for handling dynamic content and multi-step workflows. 
By combining session management, JavaScript execution, and structured extraction strategies, you can effectively navigate and extract data from modern web applications. Always adhere to ethical web scraping practices and respect website policies. \ No newline at end of file diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index eae4cf7b..e9348223 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -1,74 +1,70 @@ -# Session Management +### Session Management -Session management in Crawl4AI allows you to maintain state across multiple requests and handle complex multi-page crawling tasks, particularly useful for dynamic websites. +Session management in Crawl4AI is a powerful feature that allows you to maintain state across multiple requests, making it particularly suitable for handling complex multi-step crawling tasks. It enables you to reuse the same browser tab (or page object) across sequential actions and crawls, which is beneficial for: -## Basic Session Usage +- **Performing JavaScript actions before and after crawling.** +- **Executing multiple sequential crawls faster** without needing to reopen tabs or allocate memory repeatedly. -Use `session_id` to maintain state between requests: +**Note:** This feature is designed for sequential workflows and is not suitable for parallel operations. 
+ +--- + +#### Basic Session Usage + +Use `BrowserConfig` and `CrawlerRunConfig` to maintain state with a `session_id`: ```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + async with AsyncWebCrawler() as crawler: session_id = "my_session" - + + # Define configurations + config1 = CrawlerRunConfig(url="https://example.com/page1", session_id=session_id) + config2 = CrawlerRunConfig(url="https://example.com/page2", session_id=session_id) + # First request - result1 = await crawler.arun( - url="https://example.com/page1", - session_id=session_id - ) - - # Subsequent request using same session - result2 = await crawler.arun( - url="https://example.com/page2", - session_id=session_id - ) - + result1 = await crawler.arun(config=config1) + + # Subsequent request using the same session + result2 = await crawler.arun(config=config2) + # Clean up when done await crawler.crawler_strategy.kill_session(session_id) ``` -## Dynamic Content with Sessions +--- -Here's a real-world example of crawling GitHub commits across multiple pages: +#### Dynamic Content with Sessions + +Here's an example of crawling GitHub commits across multiple pages while preserving session state: ```python +from crawl4ai.async_configs import CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai.cache_context import CacheMode + async def crawl_dynamic_content(): - async with AsyncWebCrawler(verbose=True) as crawler: + async with AsyncWebCrawler() as crawler: + session_id = "github_commits_session" url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" all_commits = [] - # Define navigation JavaScript - js_next_page = """ - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - """ - - # Define wait condition - wait_for = """() => { - const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); - if (commits.length === 0) 
return false; - const firstCommit = commits[0].textContent.trim(); - return firstCommit !== window.firstCommit; - }""" - # Define extraction schema schema = { "name": "Commit Extractor", "baseSelector": "li.Box-sc-g0xbh4-0", - "fields": [ - { - "name": "title", - "selector": "h4.markdown-title", - "type": "text", - "transform": "strip", - }, - ], + "fields": [{"name": "title", "selector": "h4.markdown-title", "type": "text"}], } extraction_strategy = JsonCssExtractionStrategy(schema) + # JavaScript and wait configurations + js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();""" + wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0""" + # Crawl multiple pages for page in range(3): - result = await crawler.arun( + config = CrawlerRunConfig( url=url, session_id=session_id, extraction_strategy=extraction_strategy, @@ -78,6 +74,7 @@ async def crawl_dynamic_content(): cache_mode=CacheMode.BYPASS ) + result = await crawler.arun(config=config) if result.success: commits = json.loads(result.extracted_content) all_commits.extend(commits) @@ -88,46 +85,53 @@ async def crawl_dynamic_content(): return all_commits ``` -## Session Best Practices +--- -1. **Session Naming**: -```python -# Use descriptive session IDs -session_id = "login_flow_session" -session_id = "product_catalog_session" -``` +#### Session Best Practices + +1. **Descriptive Session IDs**: + Use meaningful names for session IDs to organize workflows: + ```python + session_id = "login_flow_session" + session_id = "product_catalog_session" + ``` 2. **Resource Management**: -```python -try: - # Your crawling code - pass -finally: - # Always clean up sessions - await crawler.crawler_strategy.kill_session(session_id) -``` + Always ensure sessions are cleaned up to free resources: + ```python + try: + # Your crawling code here + pass + finally: + await crawler.crawler_strategy.kill_session(session_id) + ``` -3. 
**State Management**: -```python -# First page: login -result = await crawler.arun( - url="https://example.com/login", - session_id=session_id, - js_code="document.querySelector('form').submit();" -) +3. **State Maintenance**: + Reuse the session for subsequent actions within the same workflow: + ```python + # Step 1: Login + login_config = CrawlerRunConfig( + url="https://example.com/login", + session_id=session_id, + js_code="document.querySelector('form').submit();" + ) + await crawler.arun(config=login_config) -# Second page: verify login success -result = await crawler.arun( - url="https://example.com/dashboard", - session_id=session_id, - wait_for="css:.user-profile" # Wait for authenticated content -) -``` + # Step 2: Verify login success + dashboard_config = CrawlerRunConfig( + url="https://example.com/dashboard", + session_id=session_id, + wait_for="css:.user-profile" # Wait for authenticated content + ) + result = await crawler.arun(config=dashboard_config) + ``` -## Common Use Cases +--- -1. **Authentication Flows** -2. **Pagination Handling** -3. **Form Submissions** -4. **Multi-step Processes** -5. **Dynamic Content Navigation** +#### Common Use Cases for Sessions + +1. **Authentication Flows**: Login and interact with secured pages. +2. **Pagination Handling**: Navigate through multiple pages. +3. **Form Submissions**: Fill forms, submit, and process results. +4. **Multi-step Processes**: Complete workflows that span multiple actions. +5. **Dynamic Content Navigation**: Handle JavaScript-rendered or event-triggered content. 
diff --git a/docs/md_v2/api/crawl-config.md b/docs/md_v2/api/crawl-config.md new file mode 100644 index 00000000..928ae1e2 --- /dev/null +++ b/docs/md_v2/api/crawl-config.md @@ -0,0 +1,85 @@ +# CrawlerRunConfig Parameters Documentation + +## Content Processing Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `word_count_threshold` | int | 200 | Minimum word count threshold before processing content | +| `extraction_strategy` | ExtractionStrategy | None | Strategy to extract structured data from crawled pages. When None, uses NoExtractionStrategy | +| `chunking_strategy` | ChunkingStrategy | RegexChunking() | Strategy to chunk content before extraction | +| `markdown_generator` | MarkdownGenerationStrategy | None | Strategy for generating markdown from extracted content | +| `content_filter` | RelevantContentFilter | None | Optional filter to prune irrelevant content | +| `only_text` | bool | False | If True, attempt to extract text-only content where applicable | +| `css_selector` | str | None | CSS selector to extract a specific portion of the page | +| `excluded_tags` | list[str] | [] | List of HTML tags to exclude from processing | +| `keep_data_attributes` | bool | False | If True, retain `data-*` attributes while removing unwanted attributes | +| `remove_forms` | bool | False | If True, remove all `<form>` elements from the HTML | +| `prettiify` | bool | False | If True, apply `fast_format_html` to produce prettified HTML output | + +## Caching Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `cache_mode` | CacheMode | None | Defines how caching is handled. 
Defaults to CacheMode.ENABLED internally | +| `session_id` | str | None | Optional session ID to persist browser context and page instance | +| `bypass_cache` | bool | False | Legacy parameter, if True acts like CacheMode.BYPASS | +| `disable_cache` | bool | False | Legacy parameter, if True acts like CacheMode.DISABLED | +| `no_cache_read` | bool | False | Legacy parameter, if True acts like CacheMode.WRITE_ONLY | +| `no_cache_write` | bool | False | Legacy parameter, if True acts like CacheMode.READ_ONLY | + +## Page Navigation and Timing Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `wait_until` | str | "domcontentloaded" | The condition to wait for when navigating | +| `page_timeout` | int | 60000 | Timeout in milliseconds for page operations like navigation | +| `wait_for` | str | None | CSS selector or JS condition to wait for before extracting content | +| `wait_for_images` | bool | True | If True, wait for images to load before extracting content | +| `delay_before_return_html` | float | 0.1 | Delay in seconds before retrieving final HTML | +| `mean_delay` | float | 0.1 | Mean base delay between requests when calling arun_many | +| `max_range` | float | 0.3 | Max random additional delay range for requests in arun_many | +| `semaphore_count` | int | 5 | Number of concurrent operations allowed | + +## Page Interaction Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `js_code` | str or list[str] | None | JavaScript code/snippets to run on the page | +| `js_only` | bool | False | If True, indicates subsequent calls are JS-driven updates | +| `ignore_body_visibility` | bool | True | If True, ignore whether the body is visible before proceeding | +| `scan_full_page` | bool | False | If True, scroll through the entire page to load all content | +| `scroll_delay` | float | 0.2 | Delay in seconds between scroll steps if scan_full_page is True | +| 
`process_iframes` | bool | False | If True, attempts to process and inline iframe content | +| `remove_overlay_elements` | bool | False | If True, remove overlays/popups before extracting HTML | +| `simulate_user` | bool | False | If True, simulate user interactions for anti-bot measures | +| `override_navigator` | bool | False | If True, overrides navigator properties for more human-like behavior | +| `magic` | bool | False | If True, attempts automatic handling of overlays/popups | +| `adjust_viewport_to_content` | bool | False | If True, adjust viewport according to page content dimensions | + +## Media Handling Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `screenshot` | bool | False | Whether to take a screenshot after crawling | +| `screenshot_wait_for` | float | None | Additional wait time before taking a screenshot | +| `screenshot_height_threshold` | int | 20000 | Threshold for page height to decide screenshot strategy | +| `pdf` | bool | False | Whether to generate a PDF of the page | +| `image_description_min_word_threshold` | int | 50 | Minimum words for image description extraction | +| `image_score_threshold` | int | 3 | Minimum score threshold for processing an image | +| `exclude_external_images` | bool | False | If True, exclude all external images from processing | + +## Link and Domain Handling Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `exclude_social_media_domains` | list[str] | SOCIAL_MEDIA_DOMAINS | List of domains to exclude for social media links | +| `exclude_external_links` | bool | False | If True, exclude all external links from the results | +| `exclude_social_media_links` | bool | False | If True, exclude links pointing to social media domains | +| `exclude_domains` | list[str] | [] | List of specific domains to exclude from results | + +## Debugging and Logging Parameters + +| Parameter | Type | Default | 
Description | +|-----------|------|---------|-------------| +| `verbose` | bool | True | Enable verbose logging | +| `log_console` | bool | False | If True, log console messages from the page | \ No newline at end of file diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/basic/cache-modes.md index 01cfe34e..73460e57 100644 --- a/docs/md_v2/basic/cache-modes.md +++ b/docs/md_v2/basic/cache-modes.md @@ -45,13 +45,15 @@ if __name__ == "__main__": ### New Code (Recommended) ```python import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode # Import CacheMode +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.async_configs import CrawlerRunConfig async def use_proxy(): + config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) # Use CacheMode in CrawlerRunConfig async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", - cache_mode=CacheMode.BYPASS # New way + config=config # Pass the configuration object ) print(len(result.markdown)) @@ -64,12 +66,12 @@ if __name__ == "__main__": ## Common Migration Patterns -Old Flag | New Mode ----------|---------- -`bypass_cache=True` | `cache_mode=CacheMode.BYPASS` -`disable_cache=True` | `cache_mode=CacheMode.DISABLED` -`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` -`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` +| Old Flag | New Mode | +|-----------------------|---------------------------------| +| `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` | +| `disable_cache=True` | `cache_mode=CacheMode.DISABLED`| +| `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` | +| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` | ## Suppressing Deprecation Warnings If you need time to migrate, you can temporarily suppress deprecation warnings: diff --git a/docs/md_v2/basic/content-selection.md b/docs/md_v2/basic/content-selection.md index f5f7397b..ec838f2d 100644 --- a/docs/md_v2/basic/content-selection.md +++ 
b/docs/md_v2/basic/content-selection.md @@ -1,68 +1,58 @@ -# Content Selection +### Content Selection Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need. -## CSS Selectors +#### CSS Selectors -The simplest way to extract specific content: +Extract specific content using a `CrawlerRunConfig` with CSS selectors: ```python -# Extract specific content using CSS selector -result = await crawler.arun( - url="https://example.com", - css_selector=".main-article" # Target main article content -) +from crawl4ai.async_configs import CrawlerRunConfig -# Multiple selectors -result = await crawler.arun( - url="https://example.com", - css_selector="article h1, article .content" # Target heading and content -) +config = CrawlerRunConfig(css_selector=".main-article") # Target main article content +result = await crawler.arun(url="https://crawl4ai.com", config=config) + +config = CrawlerRunConfig(css_selector="article h1, article .content") # Target heading and content +result = await crawler.arun(url="https://crawl4ai.com", config=config) ``` -## Content Filtering +#### Content Filtering -Control what content is included or excluded: +Control content inclusion or exclusion with `CrawlerRunConfig`: ```python -result = await crawler.arun( - url="https://example.com", - # Content thresholds +config = CrawlerRunConfig( word_count_threshold=10, # Minimum words per block - - # Tag exclusions - excluded_tags=['form', 'header', 'footer', 'nav'], - - # Link filtering + excluded_tags=['form', 'header', 'footer', 'nav'], # Excluded tags exclude_external_links=True, # Remove external links exclude_social_media_links=True, # Remove social media links - - # Media filtering exclude_external_images=True # Remove external images ) + +result = await crawler.arun(url="https://crawl4ai.com", config=config) ``` -## Iframe Content +#### Iframe Content -Process content inside iframes: +Process iframe content by enabling 
specific options in `CrawlerRunConfig`: ```python -result = await crawler.arun( - url="https://example.com", - process_iframes=True, # Extract iframe content +config = CrawlerRunConfig( + process_iframes=True, # Extract iframe content remove_overlay_elements=True # Remove popups/modals that might block iframes ) + +result = await crawler.arun(url="https://crawl4ai.com", config=config) ``` -## Structured Content Selection +#### Structured Content Selection Using LLMs -### Using LLMs for Smart Selection - -Use LLMs to intelligently extract specific types of content: +Leverage LLMs for intelligent content extraction: ```python -from pydantic import BaseModel from crawl4ai.extraction_strategy import LLMExtractionStrategy +from pydantic import BaseModel +from typing import List class ArticleContent(BaseModel): title: str @@ -70,28 +60,27 @@ class ArticleContent(BaseModel): conclusion: str strategy = LLMExtractionStrategy( - provider="ollama/nemotron", # Works with any supported LLM + provider="ollama/nemotron", schema=ArticleContent.schema(), instruction="Extract the main article title, key points, and conclusion" ) -result = await crawler.arun( - url="https://example.com", - extraction_strategy=strategy -) +config = CrawlerRunConfig(extraction_strategy=strategy) + +result = await crawler.arun(url="https://crawl4ai.com", config=config) article = json.loads(result.extracted_content) ``` -### Pattern-Based Selection +#### Pattern-Based Selection -For repeated content patterns (like product listings, news feeds): +Extract content matching repetitive patterns: ```python from crawl4ai.extraction_strategy import JsonCssExtractionStrategy schema = { "name": "News Articles", - "baseSelector": "article.news-item", # Repeated element + "baseSelector": "article.news-item", "fields": [ {"name": "headline", "selector": "h2", "type": "text"}, {"name": "summary", "selector": ".summary", "type": "text"}, @@ -108,51 +97,19 @@ schema = { } strategy = JsonCssExtractionStrategy(schema) 
-result = await crawler.arun( - url="https://example.com", - extraction_strategy=strategy -) +config = CrawlerRunConfig(extraction_strategy=strategy) + +result = await crawler.arun(url="https://crawl4ai.com", config=config) articles = json.loads(result.extracted_content) ``` -## Domain-Based Filtering +#### Comprehensive Example -Control content based on domains: +Combine different selection methods using `CrawlerRunConfig`: ```python -result = await crawler.arun( - url="https://example.com", - exclude_domains=["ads.com", "tracker.com"], - exclude_social_media_domains=["facebook.com", "twitter.com"], # Custom social media domains to exclude - exclude_social_media_links=True -) -``` +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -## Media Selection - -Select specific types of media: - -```python -result = await crawler.arun(url="https://example.com") - -# Access different media types -images = result.media["images"] # List of image details -videos = result.media["videos"] # List of video details -audios = result.media["audios"] # List of audio details - -# Image with metadata -for image in images: - print(f"URL: {image['src']}") - print(f"Alt text: {image['alt']}") - print(f"Description: {image['desc']}") - print(f"Relevance score: {image['score']}") -``` - -## Comprehensive Example - -Here's how to combine different selection methods: - -```python async def extract_article_content(url: str): # Define structured extraction article_schema = { @@ -163,37 +120,16 @@ async def extract_article_content(url: str): {"name": "content", "selector": ".content", "type": "text"} ] } - - # Define LLM extraction - class ArticleAnalysis(BaseModel): - key_points: List[str] - sentiment: str - category: str + + # Define configuration + config = CrawlerRunConfig( + extraction_strategy=JsonCssExtractionStrategy(article_schema), + word_count_threshold=10, + excluded_tags=['nav', 'footer'], + exclude_external_links=True + ) async with AsyncWebCrawler() as crawler: - # 
Get structured content - pattern_result = await crawler.arun( - url=url, - extraction_strategy=JsonCssExtractionStrategy(article_schema), - word_count_threshold=10, - excluded_tags=['nav', 'footer'], - exclude_external_links=True - ) - - # Get semantic analysis - analysis_result = await crawler.arun( - url=url, - extraction_strategy=LLMExtractionStrategy( - provider="ollama/nemotron", - schema=ArticleAnalysis.schema(), - instruction="Analyze the article content" - ) - ) - - # Combine results - return { - "article": json.loads(pattern_result.extracted_content), - "analysis": json.loads(analysis_result.extracted_content), - "media": pattern_result.media - } -``` \ No newline at end of file + result = await crawler.arun(url=url, config=config) + return json.loads(result.extracted_content) +``` diff --git a/docs/md_v2/basic/content_filtering.md b/docs/md_v2/basic/content_filtering.md index 0d327f2e..14f48ec6 100644 --- a/docs/md_v2/basic/content_filtering.md +++ b/docs/md_v2/basic/content_filtering.md @@ -1,136 +1,83 @@ # Content Filtering in Crawl4AI -This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies. +This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies. ## Relevance Content Filter -The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. 
- +The `RelevantContentFilter` is an abstract class providing a common interface for content filtering strategies. Specific algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. ## Pruning Content Filter -The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold. +The `PruningContentFilter` removes less relevant nodes based on metrics like text density, link density, and tag importance. Nodes that fall below a defined threshold are pruned, leaving only high-value content. ### Usage ```python -from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig from crawl4ai.content_filter_strategy import PruningContentFilter -async def filter_content(url): - async with AsyncWebCrawler() as crawler: - content_filter = PruningContentFilter( - min_word_threshold=5, - threshold_type='dynamic', - threshold=0.45 - ) - result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) - if result.success: - print(f"Cleaned Markdown:\n{result.fit_markdown}") +config = CrawlerRunConfig( + content_filter=PruningContentFilter( + min_word_threshold=5, + threshold_type='dynamic', + threshold=0.45 + ), + fit_markdown=True # Activates markdown fitting +) + +result = await crawler.arun(url="https://example.com", config=config) + +if result.success: + print(f"Cleaned Markdown:\n{result.fit_markdown}") ``` ### Parameters - **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned.
- - **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated: - - `'fixed'`: Uses a constant threshold value for all nodes - - `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios - -- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning: - - For fixed threshold: Nodes scoring below this value are removed - - For dynamic threshold: This value is adjusted based on node properties + - `'fixed'`: Uses a constant threshold value for all nodes. + - `'dynamic'`: Adjusts thresholds based on node properties (e.g., tag importance, text/link ratios). +- **`threshold`**: (Optional, default 0.48) Base threshold for pruning: + - Fixed: Nodes scoring below this value are removed. + - Dynamic: This value adjusts based on node characteristics. ### How It Works -The pruning algorithm evaluates each node using multiple metrics: -- Text density: Ratio of actual text to overall node content -- Link density: Proportion of text within links -- Tag importance: Weight based on HTML tag type (e.g., article, p, div) -- Content quality: Metrics like text length and structural importance - -Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks. - -The algorithm is particularly effective for: -- Removing boilerplate content -- Eliminating navigation menus and sidebars -- Preserving main article content -- Maintaining document structure while removing noise - +The algorithm evaluates each node using: +- **Text density**: Ratio of text to overall content. +- **Link density**: Proportion of text within links. +- **Tag importance**: Weights based on HTML tag type (e.g., `
<article>`, `<p>`, `<div>
`). +- **Content quality**: Metrics like text length and structural importance. ## BM25 Algorithm -The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query. +The `BM25ContentFilter` uses the BM25 algorithm to rank and extract text chunks based on relevance to a search query or page metadata. ### Usage -To use the `BM25ContentFilter`, initialize it and then pass it as the `extraction_strategy` parameter to the `arun` method of the crawler. - ```python -from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig from crawl4ai.content_filter_strategy import BM25ContentFilter -async def filter_content(url, query=None): - async with AsyncWebCrawler() as crawler: - content_filter = BM25ContentFilter(user_query=query) - result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering - if result.success: - print(f"Filtered Content (JSON):\n{result.extracted_content}") - print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object - print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing. - else: - print("Error:", result.error_message) +config = CrawlerRunConfig( + content_filter=BM25ContentFilter(user_query="fruit nutrition health"), + fit_markdown=True # Activates markdown fitting +) -# Example usage: -asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query -asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query. 
+result = await crawler.arun(url="https://example.com", config=config) +if result.success: + print(f"Filtered Content:\n{result.extracted_content}") + print(f"\nFiltered Markdown:\n{result.fit_markdown}") + print(f"\nFiltered HTML:\n{result.fit_html}") +else: + print("Error:", result.error_message) ``` ### Parameters -- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query. -- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering. +- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts metadata (title, description, keywords) and uses it as the query. +- **`bm25_threshold`**: (Optional, default 1.0) Threshold controlling relevance: + - Higher values result in stricter filtering, returning only the most relevant text chunks. + - Lower values result in more lenient filtering. - -## Fit Markdown Flag - -Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`. - - -## Custom Content Filtering Strategies - -You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs.
- -```python -from crawl4ai.content_filter_strategy import RelevantContentFilter -from bs4 import BeautifulSoup, Tag -from typing import List - -class MyCustomFilter(RelevantContentFilter): - def filter_content(self, html: str) -> List[str]: - soup = BeautifulSoup(html, 'lxml') - # Implement custom filtering logic here - # Example: extract all paragraphs within divs with class "article-body" - filtered_paragraphs = [] - for tag in soup.select("div.article-body p"): - if isinstance(tag, Tag): - filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element. - return filtered_paragraphs - - - -async def custom_filter_demo(url: str): - async with AsyncWebCrawler() as crawler: - custom_filter = MyCustomFilter() - result = await crawler.arun(url, extraction_strategy=custom_filter) - if result.success: - print(result.extracted_content) - -``` - -This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques. - -## Conclusion - -Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline. diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index 87e468aa..31d33e8c 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -310,22 +310,6 @@ response = requests.post("http://localhost:11235/crawl", json=request) > **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! 
- - - - - - - - - - - - - - - - ## Usage Examples 📝 ### Basic Crawling diff --git a/docs/md_v2/basic/file-download.md b/docs/md_v2/basic/file-download.md index c37e8812..eac0f5cb 100644 --- a/docs/md_v2/basic/file-download.md +++ b/docs/md_v2/basic/file-download.md @@ -1,124 +1,109 @@ # Download Handling in Crawl4AI -This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files. +This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files. ## Enabling Downloads -By default, Crawl4AI does not download files. To enable downloads, set the `accept_downloads` parameter to `True` in either the `AsyncWebCrawler` constructor or the `arun` method. +To enable downloads, set the `accept_downloads` parameter in the `BrowserConfig` object and pass it to the crawler. ```python -from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig, AsyncWebCrawler async def main(): - async with AsyncWebCrawler(accept_downloads=True) as crawler: # Globally enable downloads + config = BrowserConfig(accept_downloads=True) # Enable downloads globally + async with AsyncWebCrawler(config=config) as crawler: # ... your crawling logic ... asyncio.run(main()) ``` -Or, enable it for a specific crawl: +Or, enable it for a specific crawl by using `CrawlerRunConfig`: ```python +from crawl4ai.async_configs import CrawlerRunConfig + async def main(): async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="...", accept_downloads=True) + config = CrawlerRunConfig(accept_downloads=True) + result = await crawler.arun(url="https://example.com", config=config) # ... ``` ## Specifying Download Location -You can specify the download directory using the `downloads_path` parameter. 
If not provided, Crawl4AI creates a "downloads" directory inside the `.crawl4ai` folder in your home directory. +Specify the download directory using the `downloads_path` attribute in the `BrowserConfig` object. If not provided, Crawl4AI defaults to creating a "downloads" directory inside the `.crawl4ai` folder in your home directory. ```python +from crawl4ai.async_configs import BrowserConfig import os -from pathlib import Path - -# ... inside your crawl function: downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path os.makedirs(downloads_path, exist_ok=True) -result = await crawler.arun(url="...", downloads_path=downloads_path, accept_downloads=True) +config = BrowserConfig(accept_downloads=True, downloads_path=downloads_path) -# ... -``` - -If you are setting it globally, provide the path to the AsyncWebCrawler: -```python -async def crawl_with_downloads(url: str, download_path: str): - async with AsyncWebCrawler( - accept_downloads=True, - downloads_path=download_path, # or set it on arun - verbose=True - ) as crawler: - result = await crawler.arun(url=url) # you still need to enable downloads per call. +async def main(): + async with AsyncWebCrawler(config=config) as crawler: + result = await crawler.arun(url="https://example.com") # ... ``` - - ## Triggering Downloads -Downloads are typically triggered by user interactions on a web page (e.g., clicking a download button). You can simulate these actions with the `js_code` parameter, injecting JavaScript code to be executed within the browser context. The `wait_for` parameter might also be crucial to allowing sufficient time for downloads to initiate before the crawler proceeds. +Downloads are typically triggered by user interactions on a web page, such as clicking a download button. Use `js_code` in `CrawlerRunConfig` to simulate these actions and `wait_for` to allow sufficient time for downloads to start. 
```python -result = await crawler.arun( - url="https://www.python.org/downloads/", +from crawl4ai.async_configs import CrawlerRunConfig + +config = CrawlerRunConfig( js_code=""" - // Find and click the first Windows installer link const downloadLink = document.querySelector('a[href$=".exe"]'); if (downloadLink) { downloadLink.click(); } """, - wait_for=5 # Wait for 5 seconds for the download to start + wait_for=5 # Wait 5 seconds for the download to start ) + +result = await crawler.arun(url="https://www.python.org/downloads/", config=config) ``` ## Accessing Downloaded Files -Downloaded file paths are stored in the `downloaded_files` attribute of the returned `CrawlResult` object. This is a list of strings, with each string representing the absolute path to a downloaded file. +The `downloaded_files` attribute of the `CrawlResult` object contains paths to downloaded files. ```python if result.downloaded_files: print("Downloaded files:") for file_path in result.downloaded_files: print(f"- {file_path}") - # Perform operations with downloaded files, e.g., check file size file_size = os.path.getsize(file_path) print(f"- File size: {file_size} bytes") else: print("No files downloaded.") ``` - -## Example: Downloading Multiple Files +## Example: Downloading Multiple Files ```python -import asyncio +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig import os from pathlib import Path -from crawl4ai import AsyncWebCrawler async def download_multiple_files(url: str, download_path: str): - - async with AsyncWebCrawler( - accept_downloads=True, - downloads_path=download_path, - verbose=True - ) as crawler: - result = await crawler.arun( - url=url, + config = BrowserConfig(accept_downloads=True, downloads_path=download_path) + async with AsyncWebCrawler(config=config) as crawler: + run_config = CrawlerRunConfig( js_code=""" - // Trigger multiple downloads (example) - const downloadLinks = document.querySelectorAll('a[download]'); // Or a more specific selector 
- for (const link of downloadLinks) { - link.click(); - await new Promise(r => setTimeout(r, 2000)); // Add a small delay between clicks if needed - } + const downloadLinks = document.querySelectorAll('a[download]'); + for (const link of downloadLinks) { + link.click(); + await new Promise(r => setTimeout(r, 2000)); // Delay between clicks + } """, - wait_for=10 # Adjust the timeout to match the expected time for all downloads to start + wait_for=10 # Wait for all downloads to start ) + result = await crawler.arun(url=url, config=run_config) if result.downloaded_files: print("Downloaded files:") @@ -126,23 +111,19 @@ async def download_multiple_files(url: str, download_path: str): print(f"- {file}") else: print("No files downloaded.") - -# Example usage +# Usage download_path = os.path.join(Path.home(), ".crawl4ai", "downloads") -os.makedirs(download_path, exist_ok=True) # Create directory if it doesn't exist - +os.makedirs(download_path, exist_ok=True) asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path)) ``` ## Important Considerations -- **Browser Context:** Downloads are managed within the browser context. Ensure your `js_code` correctly targets the download triggers on the specific web page. -- **Waiting:** Use `wait_for` to manage the timing of the crawl process if immediate download might not occur. -- **Error Handling:** Implement proper error handling to gracefully manage failed downloads or incorrect file paths. -- **Security:** Downloaded files should be scanned for potential security threats before use. +- **Browser Context:** Downloads are managed within the browser context. Ensure `js_code` correctly targets the download triggers on the webpage. +- **Timing:** Use `wait_for` in `CrawlerRunConfig` to manage download timing. +- **Error Handling:** Handle errors to manage failed downloads or incorrect paths gracefully. +- **Security:** Scan downloaded files for potential security threats before use. 
- - -This guide provides a foundation for handling downloads with Crawl4AI. You can adapt these techniques to manage downloads in various scenarios and integrate them into more complex crawling workflows. +This guide uses `BrowserConfig` and `CrawlerRunConfig` for all download-related configuration; adapt these techniques to manage downloads in more complex crawling workflows. \ No newline at end of file diff --git a/docs/md_v2/basic/output-formats.md b/docs/md_v2/basic/output-formats.md index 0d25e884..3686c23c 100644 --- a/docs/md_v2/basic/output-formats.md +++ b/docs/md_v2/basic/output-formats.md @@ -1,6 +1,6 @@ # Output Formats -Crawl4AI provides multiple output formats to suit different needs, from raw HTML to structured data using LLM or pattern-based extraction. +Crawl4AI provides multiple output formats to suit different needs, ranging from raw HTML to structured data using LLM or pattern-based extraction, and versatile markdown outputs. ## Basic Formats @@ -8,18 +8,20 @@ Crawl4AI provides multiple output formats to suit different needs, from raw HTML result = await crawler.arun(url="https://example.com") # Access different formats -raw_html = result.html # Original HTML -clean_html = result.cleaned_html # Sanitized HTML -markdown = result.markdown # Standard markdown -fit_md = result.fit_markdown # Most relevant content in markdown +raw_html = result.html # Original HTML +clean_html = result.cleaned_html # Sanitized HTML +markdown_v2 = result.markdown_v2 # Detailed markdown generation results +fit_md = result.markdown_v2.fit_markdown # Most relevant content in markdown ``` +> **Note**: The `markdown_v2` property will soon be replaced by `markdown`. It is recommended to start transitioning to using `markdown` for new implementations. + ## Raw HTML Original, unmodified HTML from the webpage.
Useful when you need to: -- Preserve the exact page structure -- Process HTML with your own tools -- Debug page issues +- Preserve the exact page structure. +- Process HTML with your own tools. +- Debug page issues. ```python result = await crawler.arun(url="https://example.com") @@ -29,167 +31,72 @@ print(result.html) # Complete HTML including headers, scripts, etc. ## Cleaned HTML Sanitized HTML with unnecessary elements removed. Automatically: -- Removes scripts and styles -- Cleans up formatting -- Preserves semantic structure +- Removes scripts and styles. +- Cleans up formatting. +- Preserves semantic structure. ```python -result = await crawler.arun( - url="https://example.com", +config = CrawlerRunConfig( excluded_tags=['form', 'header', 'footer'], # Additional tags to remove keep_data_attributes=False # Remove data-* attributes ) +result = await crawler.arun(url="https://example.com", config=config) print(result.cleaned_html) ``` ## Standard Markdown -HTML converted to clean markdown format. Great for: -- Content analysis -- Documentation -- Readability +HTML converted to clean markdown format. This output is useful for: +- Content analysis. +- Documentation. +- Readability. ```python -result = await crawler.arun( - url="https://example.com", - include_links_on_markdown=True # Include links in markdown +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + options={"include_links": True} # Include links in markdown + ) ) -print(result.markdown) +result = await crawler.arun(url="https://example.com", config=config) +print(result.markdown_v2.raw_markdown) # Standard markdown with links ``` ## Fit Markdown -Most relevant content extracted and converted to markdown. Ideal for: -- Article extraction -- Main content focus -- Removing boilerplate +Extract and convert only the most relevant content into markdown format. Best suited for: +- Article extraction. +- Focusing on the main content. +- Removing boilerplate. 
+ +To generate `fit_markdown`, use a content filter like `PruningContentFilter`: ```python -result = await crawler.arun(url="https://example.com") -print(result.fit_markdown) # Only the main content +from crawl4ai.content_filter_strategy import PruningContentFilter + +config = CrawlerRunConfig( + content_filter=PruningContentFilter( + threshold=0.7, + threshold_type="dynamic", + min_word_threshold=100 + ) +) +result = await crawler.arun(url="https://example.com", config=config) +print(result.markdown_v2.fit_markdown) # Extracted main content in markdown ``` -## Structured Data Extraction +## Markdown with Citations -Crawl4AI offers two powerful approaches for structured data extraction: - -### 1. LLM-Based Extraction - -Use any LLM (OpenAI, HuggingFace, Ollama, etc.) to extract structured data with high accuracy: +Generate markdown that includes citations for links. This format is ideal for: +- Creating structured documentation. +- Including references for extracted content. ```python -from pydantic import BaseModel -from crawl4ai.extraction_strategy import LLMExtractionStrategy - -class KnowledgeGraph(BaseModel): - entities: List[dict] - relationships: List[dict] - -strategy = LLMExtractionStrategy( - provider="ollama/nemotron", # or "huggingface/...", "ollama/..." - api_token="your-token", # not needed for Ollama - schema=KnowledgeGraph.schema(), - instruction="Extract entities and relationships from the content" +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + options={"citations": True} # Enable citations + ) ) - -result = await crawler.arun( - url="https://example.com", - extraction_strategy=strategy -) -knowledge_graph = json.loads(result.extracted_content) +result = await crawler.arun(url="https://example.com", config=config) +print(result.markdown_v2.markdown_with_citations) +print(result.markdown_v2.references_markdown) # Citations section ``` - -### 2. 
Pattern-Based Extraction - -For pages with repetitive patterns (e.g., product listings, article feeds), use JsonCssExtractionStrategy: - -```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy - -schema = { - "name": "Product Listing", - "baseSelector": ".product-card", # Repeated element - "fields": [ - {"name": "title", "selector": "h2", "type": "text"}, - {"name": "price", "selector": ".price", "type": "text"}, - {"name": "description", "selector": ".desc", "type": "text"} - ] -} - -strategy = JsonCssExtractionStrategy(schema) -result = await crawler.arun( - url="https://example.com", - extraction_strategy=strategy -) -products = json.loads(result.extracted_content) -``` - -## Content Customization - -### HTML to Text Options - -Configure markdown conversion: - -```python -result = await crawler.arun( - url="https://example.com", - html2text={ - "escape_dot": False, - "body_width": 0, - "protect_links": True, - "unicode_snob": True - } -) -``` - -### Content Filters - -Control what content is included: - -```python -result = await crawler.arun( - url="https://example.com", - word_count_threshold=10, # Minimum words per block - exclude_external_links=True, # Remove external links - exclude_external_images=True, # Remove external images - excluded_tags=['form', 'nav'] # Remove specific HTML tags -) -``` - -## Comprehensive Example - -Here's how to use multiple output formats together: - -```python -async def crawl_content(url: str): - async with AsyncWebCrawler() as crawler: - # Extract main content with fit markdown - result = await crawler.arun( - url=url, - word_count_threshold=10, - exclude_external_links=True - ) - - # Get structured data using LLM - llm_result = await crawler.arun( - url=url, - extraction_strategy=LLMExtractionStrategy( - provider="ollama/nemotron", - schema=YourSchema.schema(), - instruction="Extract key information" - ) - ) - - # Get repeated patterns (if any) - pattern_result = await crawler.arun( - url=url, - 
extraction_strategy=JsonCssExtractionStrategy(your_schema) - ) - - return { - "main_content": result.fit_markdown, - "structured_data": json.loads(llm_result.extracted_content), - "pattern_data": json.loads(pattern_result.extracted_content), - "media": result.media - } -``` \ No newline at end of file diff --git a/docs/md_v2/basic/page-interaction.md b/docs/md_v2/basic/page-interaction.md index 7555f225..07a2c9cd 100644 --- a/docs/md_v2/basic/page-interaction.md +++ b/docs/md_v2/basic/page-interaction.md @@ -7,11 +7,13 @@ Crawl4AI provides powerful features for interacting with dynamic webpages, handl ### Basic Execution ```python +from crawl4ai.async_configs import CrawlerRunConfig + # Single JavaScript command -result = await crawler.arun( - url="https://example.com", +config = CrawlerRunConfig( js_code="window.scrollTo(0, document.body.scrollHeight);" ) +result = await crawler.arun(url="https://example.com", config=config) # Multiple commands js_commands = [ @@ -19,10 +21,8 @@ js_commands = [ "document.querySelector('.load-more').click();", "document.querySelector('#consent-button').click();" ] -result = await crawler.arun( - url="https://example.com", - js_code=js_commands -) +config = CrawlerRunConfig(js_code=js_commands) +result = await crawler.arun(url="https://example.com", config=config) ``` ## Wait Conditions @@ -32,10 +32,8 @@ result = await crawler.arun( Wait for elements to appear: ```python -result = await crawler.arun( - url="https://example.com", - wait_for="css:.dynamic-content" # Wait for element with class 'dynamic-content' -) +config = CrawlerRunConfig(wait_for="css:.dynamic-content") # Wait for element with class 'dynamic-content' +result = await crawler.arun(url="https://example.com", config=config) ``` ### JavaScript-Based Waiting @@ -48,10 +46,8 @@ wait_condition = """() => { return document.querySelectorAll('.item').length > 10; }""" -result = await crawler.arun( - url="https://example.com", - wait_for=f"js:{wait_condition}" -) +config = 
CrawlerRunConfig(wait_for=f"js:{wait_condition}") +result = await crawler.arun(url="https://example.com", config=config) # Wait for dynamic content to load wait_for_content = """() => { @@ -59,10 +55,8 @@ wait_for_content = """() => { return content && content.innerText.length > 100; }""" -result = await crawler.arun( - url="https://example.com", - wait_for=f"js:{wait_for_content}" -) +config = CrawlerRunConfig(wait_for=f"js:{wait_for_content}") +result = await crawler.arun(url="https://example.com", config=config) ``` ## Handling Dynamic Content @@ -72,18 +66,14 @@ result = await crawler.arun( Handle infinite scroll or load more buttons: ```python -# Scroll and wait pattern -result = await crawler.arun( - url="https://example.com", +config = CrawlerRunConfig( js_code=[ - # Scroll to bottom - "window.scrollTo(0, document.body.scrollHeight);", - # Click load more if exists - "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" + "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom + "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" # Click load more ], - # Wait for new content - wait_for="js:() => document.querySelectorAll('.item').length > previousCount" + wait_for="js:() => document.querySelectorAll('.item').length > previousCount" # Wait for new content ) +result = await crawler.arun(url="https://example.com", config=config) ``` ### Form Interaction @@ -92,17 +82,15 @@ Handle forms and inputs: ```python js_form_interaction = """ - // Fill form fields - document.querySelector('#search').value = 'search term'; - // Submit form - document.querySelector('form').submit(); + document.querySelector('#search').value = 'search term'; // Fill form fields + document.querySelector('form').submit(); // Submit form """ -result = await crawler.arun( - url="https://example.com", +config = CrawlerRunConfig( js_code=js_form_interaction, wait_for="css:.results" # Wait for results to load ) 
+result = await crawler.arun(url="https://example.com", config=config) ``` ## Timing Control @@ -112,11 +100,11 @@ result = await crawler.arun( Control timing of interactions: ```python -result = await crawler.arun( - url="https://example.com", +config = CrawlerRunConfig( page_timeout=60000, # Page load timeout (ms) - delay_before_return_html=2.0, # Wait before capturing content + delay_before_return_html=2.0 # Wait before capturing content ) +result = await crawler.arun(url="https://example.com", config=config) ``` ## Complex Interactions Example @@ -124,43 +112,37 @@ result = await crawler.arun( Here's an example of handling a dynamic page with multiple interactions: ```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + async def crawl_dynamic_content(): async with AsyncWebCrawler() as crawler: # Initial page load - result = await crawler.arun( - url="https://example.com", - # Handle cookie consent - js_code="document.querySelector('.cookie-accept')?.click();", + config = CrawlerRunConfig( + js_code="document.querySelector('.cookie-accept')?.click();", # Handle cookie consent wait_for="css:.main-content" ) + result = await crawler.arun(url="https://example.com", config=config) # Load more content session_id = "dynamic_session" # Keep session for multiple interactions for page in range(3): # Load 3 pages of content - result = await crawler.arun( - url="https://example.com", + config = CrawlerRunConfig( session_id=session_id, js_code=[ - # Scroll to bottom - "window.scrollTo(0, document.body.scrollHeight);", - # Store current item count - "window.previousCount = document.querySelectorAll('.item').length;", - # Click load more - "document.querySelector('.load-more')?.click();" + "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom + "window.previousCount = document.querySelectorAll('.item').length;", # Store item count + "document.querySelector('.load-more')?.click();" # Click load more ], - # Wait for new items 
wait_for="""() => { const currentCount = document.querySelectorAll('.item').length; return currentCount > window.previousCount; }""", - # Only execute JS without reloading page - js_only=True if page > 0 else False + js_only=(page > 0) # Execute JS without reloading page for subsequent interactions ) - - # Process content after each load + result = await crawler.arun(url="https://example.com", config=config) print(f"Page {page + 1} items:", len(result.cleaned_html)) - + # Clean up session await crawler.crawler_strategy.kill_session(session_id) ``` @@ -171,6 +153,7 @@ Combine page interaction with structured extraction: ```python from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai.async_configs import CrawlerRunConfig # Pattern-based extraction after interaction schema = { @@ -182,20 +165,19 @@ schema = { ] } -result = await crawler.arun( - url="https://example.com", +config = CrawlerRunConfig( js_code="window.scrollTo(0, document.body.scrollHeight);", wait_for="css:.item:nth-child(10)", # Wait for 10 items extraction_strategy=JsonCssExtractionStrategy(schema) ) +result = await crawler.arun(url="https://example.com", config=config) # Or use LLM to analyze dynamic content class ContentAnalysis(BaseModel): topics: List[str] summary: str -result = await crawler.arun( - url="https://example.com", +config = CrawlerRunConfig( js_code="document.querySelector('.show-more').click();", wait_for="css:.full-content", extraction_strategy=LLMExtractionStrategy( @@ -204,4 +186,5 @@ result = await crawler.arun( instruction="Analyze the full content" ) ) -``` \ No newline at end of file +result = await crawler.arun(url="https://example.com", config=config) +``` diff --git a/docs/md_v2/basic/prefix-based-input.md b/docs/md_v2/basic/prefix-based-input.md index 42987a67..6dfae9d4 100644 --- a/docs/md_v2/basic/prefix-based-input.md +++ b/docs/md_v2/basic/prefix-based-input.md @@ -2,31 +2,19 @@ This guide will walk you through using 
the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example. -## Table of Contents -- [Prefix-Based Input Handling in Crawl4AI](#prefix-based-input-handling-in-crawl4ai) - - [Table of Contents](#table-of-contents) - - [Crawling a Web URL](#crawling-a-web-url) - - [Crawling a Local HTML File](#crawling-a-local-html-file) - - [Crawling Raw HTML Content](#crawling-raw-html-content) - - [Complete Example](#complete-example) - - [**How It Works**](#how-it-works) - - [**Running the Example**](#running-the-example) - - [Conclusion](#conclusion) +## Crawling a Web URL ---- - - -### Crawling a Web URL - -To crawl a live web page, provide the URL starting with `http://` or `https://`. +To crawl a live web page, provide the URL starting with `http://` or `https://`, using a `CrawlerRunConfig` object: ```python import asyncio from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig async def crawl_web(): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", bypass_cache=True) + config = CrawlerRunConfig(bypass_cache=True) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", config=config) if result.success: print("Markdown Content:") print(result.markdown) @@ -36,20 +24,22 @@ async def crawl_web(): asyncio.run(crawl_web()) ``` -### Crawling a Local HTML File +## Crawling a Local HTML File To crawl a local HTML file, prefix the file path with `file://`. 
```python import asyncio from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig async def crawl_local_file(): local_file_path = "/path/to/apple.html" # Replace with your file path file_url = f"file://{local_file_path}" + config = CrawlerRunConfig(bypass_cache=True) - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url=file_url, bypass_cache=True) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=file_url, config=config) if result.success: print("Markdown Content from Local File:") print(result.markdown) @@ -59,20 +49,22 @@ async def crawl_local_file(): asyncio.run(crawl_local_file()) ``` -### Crawling Raw HTML Content +## Crawling Raw HTML Content To crawl raw HTML content, prefix the HTML string with `raw:`. ```python import asyncio from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig async def crawl_raw_html(): raw_html = "

<html><body><h1>Hello, World!</h1></body></html>

" raw_html_url = f"raw:{raw_html}" + config = CrawlerRunConfig(bypass_cache=True) - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url=raw_html_url, bypass_cache=True) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=raw_html_url, config=config) if result.success: print("Markdown Content from Raw HTML:") print(result.markdown) @@ -84,152 +76,83 @@ asyncio.run(crawl_raw_html()) --- -## Complete Example +# Complete Example Below is a comprehensive script that: -1. **Crawls the Wikipedia page for "Apple".** -2. **Saves the HTML content to a local file (`apple.html`).** -3. **Crawls the local HTML file and verifies the markdown length matches the original crawl.** -4. **Crawls the raw HTML content from the saved file and verifies consistency.** + +1. Crawls the Wikipedia page for "Apple." +2. Saves the HTML content to a local file (`apple.html`). +3. Crawls the local HTML file and verifies the markdown length matches the original crawl. +4. Crawls the raw HTML content from the saved file and verifies consistency. 
```python import os import sys import asyncio from pathlib import Path - -# Adjust the parent directory to include the crawl4ai module -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) - from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig async def main(): - # Define the URL to crawl wikipedia_url = "https://en.wikipedia.org/wiki/apple" - - # Define the path to save the HTML file - # Save the file in the same directory as the script script_dir = Path(__file__).parent html_file_path = script_dir / "apple.html" - - async with AsyncWebCrawler(verbose=True) as crawler: + + async with AsyncWebCrawler() as crawler: + # Step 1: Crawl the Web URL print("\n=== Step 1: Crawling the Wikipedia URL ===") - # Crawl the Wikipedia URL - result = await crawler.arun(url=wikipedia_url, bypass_cache=True) - - # Check if crawling was successful + web_config = CrawlerRunConfig(bypass_cache=True) + result = await crawler.arun(url=wikipedia_url, config=web_config) + if not result.success: print(f"Failed to crawl {wikipedia_url}: {result.error_message}") return - - # Save the HTML content to a local file + with open(html_file_path, 'w', encoding='utf-8') as f: f.write(result.html) - print(f"Saved HTML content to {html_file_path}") - - # Store the length of the generated markdown web_crawl_length = len(result.markdown) print(f"Length of markdown from web crawl: {web_crawl_length}\n") - + + # Step 2: Crawl from the Local HTML File print("=== Step 2: Crawling from the Local HTML File ===") - # Construct the file URL with 'file://' prefix file_url = f"file://{html_file_path.resolve()}" - - # Crawl the local HTML file - local_result = await crawler.arun(url=file_url, bypass_cache=True) - - # Check if crawling was successful + file_config = CrawlerRunConfig(bypass_cache=True) + local_result = await crawler.arun(url=file_url, config=file_config) + if not local_result.success: print(f"Failed to crawl local 
file {file_url}: {local_result.error_message}") return - - # Store the length of the generated markdown from local file + local_crawl_length = len(local_result.markdown) - print(f"Length of markdown from local file crawl: {local_crawl_length}") - - # Compare the lengths - assert web_crawl_length == local_crawl_length, ( - f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Local file crawl ({local_crawl_length})" - ) - print("✅ Markdown length matches between web crawl and local file crawl.\n") - + assert web_crawl_length == local_crawl_length, "Markdown length mismatch" + print("✅ Markdown length matches between web and local file crawl.\n") + + # Step 3: Crawl Using Raw HTML Content print("=== Step 3: Crawling Using Raw HTML Content ===") - # Read the HTML content from the saved file with open(html_file_path, 'r', encoding='utf-8') as f: raw_html_content = f.read() - - # Prefix the raw HTML content with 'raw:' raw_html_url = f"raw:{raw_html_content}" - - # Crawl using the raw HTML content - raw_result = await crawler.arun(url=raw_html_url, bypass_cache=True) - - # Check if crawling was successful + raw_config = CrawlerRunConfig(bypass_cache=True) + raw_result = await crawler.arun(url=raw_html_url, config=raw_config) + if not raw_result.success: print(f"Failed to crawl raw HTML content: {raw_result.error_message}") return - - # Store the length of the generated markdown from raw HTML + raw_crawl_length = len(raw_result.markdown) - print(f"Length of markdown from raw HTML crawl: {raw_crawl_length}") - - # Compare the lengths - assert web_crawl_length == raw_crawl_length, ( - f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Raw HTML crawl ({raw_crawl_length})" - ) - print("✅ Markdown length matches between web crawl and raw HTML crawl.\n") - + assert web_crawl_length == raw_crawl_length, "Markdown length mismatch" + print("✅ Markdown length matches between web and raw HTML crawl.\n") + print("All tests passed successfully!") - - # Clean up by 
removing the saved HTML file if html_file_path.exists(): os.remove(html_file_path) - print(f"Removed the saved HTML file: {html_file_path}") -# Run the main function if __name__ == "__main__": asyncio.run(main()) ``` -### **How It Works** - -1. **Step 1: Crawl the Web URL** - - Crawls `https://en.wikipedia.org/wiki/apple`. - - Saves the HTML content to `apple.html`. - - Records the length of the generated markdown. - -2. **Step 2: Crawl from the Local HTML File** - - Uses the `file://` prefix to crawl `apple.html`. - - Ensures the markdown length matches the original web crawl. - -3. **Step 3: Crawl Using Raw HTML Content** - - Reads the HTML from `apple.html`. - - Prefixes it with `raw:` and crawls. - - Verifies the markdown length matches the previous results. - -4. **Cleanup** - - Deletes the `apple.html` file after testing. - -### **Running the Example** - -1. **Save the Script:** - - Save the above code as `test_crawl4ai.py` in your project directory. - -2. **Execute the Script:** - - Run the script using: - ```bash - python test_crawl4ai.py - ``` - -3. **Observe the Output:** - - The script will print logs detailing each step. - - Assertions ensure consistency across different crawling methods. - - Upon success, it confirms that all markdown lengths match. - --- -## Conclusion - -With the new prefix-based input handling in **Crawl4AI**, you can effortlessly crawl web URLs, local HTML files, and raw HTML strings using a unified `url` parameter. This enhancement simplifies the API usage and provides greater flexibility for diverse crawling scenarios. +# Conclusion +With the unified `url` parameter and prefix-based handling in **Crawl4AI**, you can seamlessly handle web URLs, local HTML files, and raw HTML content. Use `CrawlerRunConfig` for flexible and consistent configuration in all scenarios. 
\ No newline at end of file diff --git a/docs/md_v2/basic/quickstart.md b/docs/md_v2/basic/quickstart.md index c18cd7d1..ffc35986 100644 --- a/docs/md_v2/basic/quickstart.md +++ b/docs/md_v2/basic/quickstart.md @@ -1,49 +1,66 @@ # Quick Start Guide 🚀 -Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll walk you through the basic usage of Crawl4AI with a friendly and humorous tone. We'll cover everything from basic usage to advanced features like chunking and extraction strategies, all with the power of asynchronous programming. Let's dive in! 🌟 +Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll walk you through the basic usage of Crawl4AI, covering everything from initial setup to advanced features like chunking and extraction strategies, using asynchronous programming. Let's dive in! 🌟 + +--- ## Getting Started 🛠️ -First, let's import the necessary modules and create an instance of `AsyncWebCrawler`. We'll use an async context manager, which handles the setup and teardown of the crawler for us. +Set up your environment with `BrowserConfig` and create an `AsyncWebCrawler` instance. ```python import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: - # We'll add our crawling code here + browser_config = BrowserConfig(verbose=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + # Add your crawling logic here pass if __name__ == "__main__": asyncio.run(main()) ``` +--- + ### Basic Usage -Simply provide a URL and let Crawl4AI do the magic! +Provide a URL and let Crawl4AI do the work! 
```python +from crawl4ai.async_configs import CrawlerRunConfig + async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://www.nbcnews.com/business") + browser_config = BrowserConfig(verbose=True) + crawl_config = CrawlerRunConfig(url="https://www.nbcnews.com/business") + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(config=crawl_config) print(f"Basic crawl result: {result.markdown[:500]}") # Print first 500 characters -asyncio.run(main()) +if __name__ == "__main__": + asyncio.run(main()) ``` +--- + ### Taking Screenshots 📸 -Capture screenshots of web pages easily: +Capture and save webpage screenshots with `CrawlerRunConfig`: ```python +from crawl4ai.async_configs import CacheMode + async def capture_and_save_screenshot(url: str, output_path: str): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url=url, - screenshot=True, - cache_mode=CacheMode.BYPASS - ) + browser_config = BrowserConfig(verbose=True) + crawl_config = CrawlerRunConfig( + url=url, + screenshot=True, + cache_mode=CacheMode.BYPASS + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(config=crawl_config) if result.success and result.screenshot: import base64 @@ -55,243 +72,101 @@ async def capture_and_save_screenshot(url: str, output_path: str): print("Failed to capture screenshot") ``` +--- + ### Browser Selection 🌐 -Crawl4AI supports multiple browser engines. 
Here's how to use different browsers: +Choose from multiple browser engines using `BrowserConfig`: ```python +from crawl4ai.async_configs import BrowserConfig + # Use Firefox -async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) +firefox_config = BrowserConfig(browser_type="firefox", verbose=True, headless=True) +async with AsyncWebCrawler(config=firefox_config) as crawler: + result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com")) # Use WebKit -async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) +webkit_config = BrowserConfig(browser_type="webkit", verbose=True, headless=True) +async with AsyncWebCrawler(config=webkit_config) as crawler: + result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com")) # Use Chromium (default) -async with AsyncWebCrawler(verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) +chromium_config = BrowserConfig(verbose=True, headless=True) +async with AsyncWebCrawler(config=chromium_config) as crawler: + result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com")) ``` +--- + ### User Simulation 🎭 -Simulate real user behavior to avoid detection: +Simulate real user behavior to bypass detection: ```python -async with AsyncWebCrawler(verbose=True, headless=True) as crawler: - result = await crawler.arun( - url="YOUR-URL-HERE", - cache_mode=CacheMode.BYPASS, - simulate_user=True, # Causes random mouse movements and clicks - override_navigator=True # Makes the browser appear more like a real user - ) +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +browser_config = BrowserConfig(verbose=True, headless=True) +crawl_config 
= CrawlerRunConfig( + url="YOUR-URL-HERE", + cache_mode=CacheMode.BYPASS, + simulate_user=True, # Random mouse movements and clicks + override_navigator=True # Makes the browser appear like a real user +) +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(config=crawl_config) ``` +--- + ### Understanding Parameters 🧠 -By default, Crawl4AI caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action. +Explore caching and forcing fresh crawls: ```python async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: - # First crawl (caches the result) - result1 = await crawler.arun(url="https://www.nbcnews.com/business") + browser_config = BrowserConfig(verbose=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # First crawl (uses cache) + result1 = await crawler.arun(config=CrawlerRunConfig(url="https://www.nbcnews.com/business")) print(f"First crawl result: {result1.markdown[:100]}...") - # Force to crawl again - result2 = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS) + # Force fresh crawl + result2 = await crawler.arun( + config=CrawlerRunConfig(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS) + ) print(f"Second crawl result: {result2.markdown[:100]}...") -asyncio.run(main()) +if __name__ == "__main__": + asyncio.run(main()) ``` +--- + ### Adding a Chunking Strategy 🧩 -Let's add a chunking strategy: `RegexChunking`! This strategy splits the text based on a given regex pattern. 
+Split content into chunks using `RegexChunking`: ```python from crawl4ai.chunking_strategy import RegexChunking async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", - chunking_strategy=RegexChunking(patterns=["\n\n"]) - ) + browser_config = BrowserConfig(verbose=True) + crawl_config = CrawlerRunConfig( + url="https://www.nbcnews.com/business", + chunking_strategy=RegexChunking(patterns=["\n\n"]) + ) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(config=crawl_config) print(f"RegexChunking result: {result.extracted_content[:200]}...") -asyncio.run(main()) +if __name__ == "__main__": + asyncio.run(main()) ``` -### Using LLMExtractionStrategy with Different Providers 🤖 +--- -Crawl4AI supports multiple LLM providers for extraction: +### Advanced Features and Configurations -```python -from crawl4ai.extraction_strategy import LLMExtractionStrategy -from pydantic import BaseModel, Field - -class OpenAIModelFee(BaseModel): - model_name: str = Field(..., description="Name of the OpenAI model.") - input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") - output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") - -# OpenAI -await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) - -# Hugging Face -await extract_structured_data_using_llm( - "huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", - os.getenv("HUGGINGFACE_API_KEY") -) - -# Ollama -await extract_structured_data_using_llm("ollama/llama3.2") - -# With custom headers -custom_headers = { - "Authorization": "Bearer your-custom-token", - "X-Custom-Header": "Some-Value" -} -await extract_structured_data_using_llm(extra_headers=custom_headers) -``` - -### Knowledge Graph Generation 🕸️ - -Generate knowledge graphs from web content: - -```python -from pydantic import BaseModel -from typing import 
List - -class Entity(BaseModel): - name: str - description: str - -class Relationship(BaseModel): - entity1: Entity - entity2: Entity - description: str - relation_type: str - -class KnowledgeGraph(BaseModel): - entities: List[Entity] - relationships: List[Relationship] - -extraction_strategy = LLMExtractionStrategy( - provider='openai/gpt-4o-mini', - api_token=os.getenv('OPENAI_API_KEY'), - schema=KnowledgeGraph.model_json_schema(), - extraction_type="schema", - instruction="Extract entities and relationships from the given text." -) - -async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url="https://paulgraham.com/love.html", - cache_mode=CacheMode.BYPASS, - extraction_strategy=extraction_strategy - ) -``` - -### Advanced Session-Based Crawling with Dynamic Content 🔄 - -For modern web applications with dynamic content loading, here's how to handle pagination and content updates: - -```python -async def crawl_dynamic_content(): - async with AsyncWebCrawler(verbose=True) as crawler: - url = "https://github.com/microsoft/TypeScript/commits/main" - session_id = "typescript_commits_session" - - js_next_page = """ - const button = document.querySelector('a[data-testid="pagination-next-button"]'); - if (button) button.click(); - """ - - wait_for = """() => { - const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4'); - if (commits.length === 0) return false; - const firstCommit = commits[0].textContent.trim(); - return firstCommit !== window.firstCommit; - }""" - - schema = { - "name": "Commit Extractor", - "baseSelector": "li.Box-sc-g0xbh4-0", - "fields": [ - { - "name": "title", - "selector": "h4.markdown-title", - "type": "text", - "transform": "strip", - }, - ], - } - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - for page in range(3): # Crawl 3 pages - result = await crawler.arun( - url=url, - session_id=session_id, - css_selector="li.Box-sc-g0xbh4-0", - extraction_strategy=extraction_strategy, - 
js_code=js_next_page if page > 0 else None, - wait_for=wait_for if page > 0 else None, - js_only=page > 0, - cache_mode=CacheMode.BYPASS, - headless=False, - ) - - await crawler.crawler_strategy.kill_session(session_id) -``` - -### Handling Overlays and Fitting Content 📏 - -Remove overlay elements and fit content appropriately: - -```python -async with AsyncWebCrawler(headless=False) as crawler: - result = await crawler.arun( - url="your-url-here", - cache_mode=CacheMode.BYPASS, - word_count_threshold=10, - remove_overlay_elements=True, - screenshot=True - ) -``` - -## Performance Comparison 🏎️ - -Crawl4AI offers impressive performance compared to other solutions: - -```python -# Firecrawl comparison -from firecrawl import FirecrawlApp -app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY']) -start = time.time() -scrape_status = app.scrape_url( - 'https://www.nbcnews.com/business', - params={'formats': ['markdown', 'html']} -) -end = time.time() - -# Crawl4AI comparison -async with AsyncWebCrawler() as crawler: - start = time.time() - result = await crawler.arun( - url="https://www.nbcnews.com/business", - word_count_threshold=0, - cache_mode=CacheMode.BYPASS, - verbose=False, - ) - end = time.time() -``` - -Note: Performance comparisons should be conducted in environments with stable and fast internet connections for accurate results. - -## Congratulations! 🎉 - -You've made it through the updated Crawl4AI Quickstart Guide! Now you're equipped with even more powerful features to crawl the web asynchronously like a pro! 🕸️ - -Happy crawling! 🚀 \ No newline at end of file +For advanced examples (LLM strategies, knowledge graphs, pagination handling), ensure all code aligns with the `BrowserConfig` and `CrawlerRunConfig` pattern shown above. 
diff --git a/docs/md_v2/basic/simple-crawling.md b/docs/md_v2/basic/simple-crawling.md index 60d9663b..ec63984c 100644 --- a/docs/md_v2/basic/simple-crawling.md +++ b/docs/md_v2/basic/simple-crawling.md @@ -4,16 +4,21 @@ This guide covers the basics of web crawling with Crawl4AI. You'll learn how to ## Basic Usage -Here's the simplest way to crawl a webpage: +Set up a simple crawl using `BrowserConfig` and `CrawlerRunConfig`: ```python import asyncio from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig async def main(): - async with AsyncWebCrawler() as crawler: + browser_config = BrowserConfig() # Default browser configuration + run_config = CrawlerRunConfig() # Default crawl run configuration + + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( - url="https://example.com" + url="https://example.com", + config=run_config ) print(result.markdown) # Print clean markdown content @@ -26,7 +31,10 @@ if __name__ == "__main__": The `arun()` method returns a `CrawlResult` object with several useful properties. 
Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details): ```python -result = await crawler.arun(url="https://example.com", fit_markdown=True) +result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig(fit_markdown=True) +) # Different content formats print(result.html) # Raw HTML @@ -45,16 +53,20 @@ print(result.links) # Dictionary of internal and external links ## Adding Basic Options -Customize your crawl with these common options: +Customize your crawl using `CrawlerRunConfig`: ```python -result = await crawler.arun( - url="https://example.com", +run_config = CrawlerRunConfig( word_count_threshold=10, # Minimum words per content block exclude_external_links=True, # Remove external links remove_overlay_elements=True, # Remove popups/modals process_iframes=True # Process iframe content ) + +result = await crawler.arun( + url="https://example.com", + config=run_config +) ``` ## Handling Errors @@ -62,7 +74,9 @@ result = await crawler.arun( Always check if the crawl was successful: ```python -result = await crawler.arun(url="https://example.com") +run_config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=run_config) + if not result.success: print(f"Crawl failed: {result.error_message}") print(f"Status code: {result.status_code}") @@ -70,36 +84,45 @@ if not result.success: ## Logging and Debugging -Enable verbose mode for detailed logging: +Enable verbose logging in `BrowserConfig`: ```python -async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://example.com") +browser_config = BrowserConfig(verbose=True) + +async with AsyncWebCrawler(config=browser_config) as crawler: + run_config = CrawlerRunConfig() + result = await crawler.arun(url="https://example.com", config=run_config) ``` ## Complete Example -Here's a more comprehensive example showing common usage patterns: +Here's a more comprehensive example demonstrating common usage 
patterns: ```python import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode async def main(): - async with AsyncWebCrawler(verbose=True) as crawler: + browser_config = BrowserConfig(verbose=True) + run_config = CrawlerRunConfig( + # Content filtering + word_count_threshold=10, + excluded_tags=['form', 'header'], + exclude_external_links=True, + + # Content processing + process_iframes=True, + remove_overlay_elements=True, + + # Cache control + cache_mode=CacheMode.ENABLED # Use cache if available + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun( url="https://example.com", - # Content filtering - word_count_threshold=10, - excluded_tags=['form', 'header'], - exclude_external_links=True, - - # Content processing - process_iframes=True, - remove_overlay_elements=True, - - # Cache control - cache_mode=CacheMode.ENABLED # Use cache if available + config=run_config ) if result.success: diff --git a/docs/md_v2/blog/articles/dockerize_hooks.md b/docs/md_v2/blog/articles/dockerize_hooks.md new file mode 100644 index 00000000..965388ee --- /dev/null +++ b/docs/md_v2/blog/articles/dockerize_hooks.md @@ -0,0 +1,46 @@ +## Introducing Event Streams and Interactive Hooks in Crawl4AI + +![event-driven-crawl](https://res.cloudinary.com/kidocode/image/upload/t_400x400/v1734344008/15bb8bbb-83ac-43ac-962d-3feb3e0c3bbf_2_tjmr4n.webp) + +In the near future, I’m planning to enhance Crawl4AI’s capabilities by introducing an event stream mechanism that will give clients deeper, real-time insights into the crawling process. Today, hooks are a powerful feature at the code level—they let developers define custom logic at key points in the crawl. However, when using Crawl4AI as a service (e.g., through a Dockerized API), there isn’t an easy way to interact with these hooks at runtime. 
+ +**What’s Changing?** + +I’m working on a solution that will allow the crawler to emit a continuous stream of events, updating clients on the current crawling stage, encountered pages, and any decision points. This event stream could be exposed over a standardized protocol like Server-Sent Events (SSE) or WebSockets, enabling clients to “subscribe” and listen as the crawler works. + +**Interactivity Through Process IDs** + +A key part of this new design is the concept of a unique process ID for each crawl session. Imagine you’re listening to an event stream that informs you: +- The crawler just hit a certain page +- It triggered a hook and is now pausing for instructions + +With the event stream in place, you can send a follow-up request back to the server—referencing the unique process ID—to provide extra data, instructions, or parameters. This might include selecting which links to follow next, adjusting extraction strategies, or providing authentication tokens for a protected API. Once the crawler receives these instructions, it resumes execution with the updated context. + +```mermaid +sequenceDiagram + participant Client + participant Server + participant Crawler + + Client->>Server: Start crawl request + Server->>Crawler: Initiate crawl with Process ID + Crawler-->>Server: Event: Page hit + Server-->>Client: Stream: Page hit event + Client->>Server: Instruction for Process ID + Server->>Crawler: Update crawl with new instructions + Crawler-->>Server: Event: Crawl completed + Server-->>Client: Stream: Crawl completed +``` + +**Benefits for Developers and Users** + +1. **Fine-Grained Control**: Instead of predefining all logic upfront, you can dynamically guide the crawler in response to actual data and conditions encountered mid-crawl. +2. **Real-Time Insights**: Monitor progress, errors, or network bottlenecks as they happen, without waiting for the entire crawl to finish. +3. 
**Enhanced Collaboration**: Different team members or automated systems can watch the same crawl events and provide input, making the crawling process more adaptive and intelligent. + +**Next Steps** + +I’m currently exploring the best APIs, technologies, and patterns to make this vision a reality. My goal is to deliver a seamless developer experience—one that integrates with existing Crawl4AI workflows while offering new flexibility and power. + +Stay tuned for more updates as I continue building this feature out. In the meantime, I’d love to hear any feedback or suggestions you might have to help shape this interactive, event-driven future of web crawling with Crawl4AI. + diff --git a/docs/md_v2/blog/releases/0.4.1.md b/docs/md_v2/blog/releases/0.4.1.md index b02b758d..e770d0b2 100644 --- a/docs/md_v2/blog/releases/0.4.1.md +++ b/docs/md_v2/blog/releases/0.4.1.md @@ -37,11 +37,11 @@ Here’s how to turn it on: ```python crawler = AsyncPlaywrightCrawlerStrategy( - text_only=True # Set this to True to enable text-only crawling + text_mode=True # Set this to True to enable text-only crawling ) ``` -When `text_only=True`, the crawler automatically: +When `text_mode=True`, the crawler automatically: - Disables GPU processing. - Blocks image and JavaScript resources. - Reduces the viewport size to 800x600 (you can override this with `viewport_width` and `viewport_height`). diff --git a/docs/md_v2/extraction/overview.md b/docs/md_v2/extraction/overview.md index 53a8b87d..7c524475 100644 --- a/docs/md_v2/extraction/overview.md +++ b/docs/md_v2/extraction/overview.md @@ -169,6 +169,35 @@ llm_result = await crawler.arun( ) ``` + +## Input Formats +All extraction strategies support different input formats to give you more control over how content is processed: + +- **markdown** (default): Uses the raw markdown conversion of the HTML content. Best for general text extraction where HTML structure isn't critical. +- **html**: Uses the raw HTML content. 
Useful when you need to preserve HTML structure or extract data from specific HTML elements. +- **fit_markdown**: Uses the cleaned and filtered markdown content. Best for extracting relevant content while removing noise. Requires a markdown generator with content filter to be configured. + +To specify an input format: +```python +strategy = LLMExtractionStrategy( + input_format="html", # or "markdown" or "fit_markdown" + provider="openai/gpt-4", + instruction="Extract product information" +) +``` + +Note: When using "fit_markdown", ensure your CrawlerRunConfig includes a markdown generator with content filter: +```python +config = CrawlerRunConfig( + extraction_strategy=strategy, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() # Content filter goes here for fit_markdown + ) +) +``` + +If fit_markdown is requested but not available (no markdown generator or content filter), the system will automatically fall back to raw markdown with a warning. + ## Best Practices 1. **Choose the Right Strategy** diff --git a/docs/md_v3/tutorials/advanced-features.md b/docs/md_v3/tutorials/advanced-features.md new file mode 100644 index 00000000..16f85874 --- /dev/null +++ b/docs/md_v3/tutorials/advanced-features.md @@ -0,0 +1,329 @@ +# Advanced Features (Proxy, PDF, Screenshot, SSL, Headers, & Storage State) + +Crawl4AI offers multiple power-user features that go beyond simple crawling. This tutorial covers: + +1. **Proxy Usage** +2. **Capturing PDFs & Screenshots** +3. **Handling SSL Certificates** +4. **Custom Headers** +5. **Session Persistence & Local Storage** + +> **Prerequisites** +> - You have a basic grasp of [AsyncWebCrawler Basics](./async-webcrawler-basics.md) +> - You know how to run or configure your Python environment with Playwright installed + +--- + +## 1. 
Proxy Usage + +If you need to route your crawl traffic through a proxy—whether for IP rotation, geo-testing, or privacy—Crawl4AI supports it via `BrowserConfig.proxy_config`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + browser_cfg = BrowserConfig( + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "myuser", + "password": "mypass", + }, + headless=True + ) + crawler_cfg = CrawlerRunConfig( + verbose=True + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun( + url="https://www.whatismyip.com/", + config=crawler_cfg + ) + if result.success: + print("[OK] Page fetched via proxy.") + print("Page HTML snippet:", result.html[:200]) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Points** +- **`proxy_config`** expects a dict with `server` and optional auth credentials. +- Many commercial proxies provide an HTTP/HTTPS “gateway” server that you specify in `server`. +- If your proxy doesn’t need auth, omit `username`/`password`. + +--- + +## 2. 
Capturing PDFs & Screenshots + +Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI can do both in one pass: + +```python +import os, asyncio +from base64 import b64decode +from crawl4ai import AsyncWebCrawler, CacheMode + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/List_of_common_misconceptions", + cache_mode=CacheMode.BYPASS, + pdf=True, + screenshot=True + ) + + if result.success: + # Save screenshot + if result.screenshot: + with open("wikipedia_screenshot.png", "wb") as f: + f.write(b64decode(result.screenshot)) + + # Save PDF + if result.pdf: + with open("wikipedia_page.pdf", "wb") as f: + f.write(b64decode(result.pdf)) + + print("[OK] PDF & screenshot captured.") + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Why PDF + Screenshot?** +- Large or complex pages can be slow or error-prone with “traditional” full-page screenshots. +- Exporting a PDF is more reliable for very long pages. Crawl4AI automatically converts the first PDF page into an image if you request both. + +**Relevant Parameters** +- **`pdf=True`**: Exports the current page as a PDF (base64-encoded in `result.pdf`). +- **`screenshot=True`**: Creates a screenshot (base64-encoded in `result.screenshot`). +- **`scan_full_page`** or advanced hooking can further refine how the crawler captures content. + +--- + +## 3. 
Handling SSL Certificates + +If you need to verify or export a site’s SSL certificate—for compliance, debugging, or data analysis—Crawl4AI can fetch it during the crawl: + +```python +import asyncio, os +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def main(): + tmp_dir = os.path.join(os.getcwd(), "tmp") + os.makedirs(tmp_dir, exist_ok=True) + + config = CrawlerRunConfig( + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + print("\nCertificate Information:") + print(f"Issuer (CN): {cert.issuer.get('CN', '')}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # Export in multiple formats: + cert.to_json(os.path.join(tmp_dir, "certificate.json")) + cert.to_pem(os.path.join(tmp_dir, "certificate.pem")) + cert.to_der(os.path.join(tmp_dir, "certificate.der")) + + print("\nCertificate exported to JSON/PEM/DER in 'tmp' folder.") + else: + print("[ERROR] No certificate or crawl failed.") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Points** +- **`fetch_ssl_certificate=True`** triggers certificate retrieval. +- `result.ssl_certificate` includes methods (`to_json`, `to_pem`, `to_der`) for saving in various formats (handy for server config, Java keystores, etc.). + +--- + +## 4. Custom Headers + +Sometimes you need to set custom headers (e.g., language preferences, authentication tokens, or specialized user-agent strings). 
You can do this in multiple ways: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + # Option 1: Set headers at the crawler strategy level + crawler1 = AsyncWebCrawler( + # The underlying strategy can accept headers in its constructor + crawler_strategy=None # We'll override below for clarity + ) + crawler1.crawler_strategy.update_user_agent("MyCustomUA/1.0") + crawler1.crawler_strategy.set_custom_headers({ + "Accept-Language": "fr-FR,fr;q=0.9" + }) + result1 = await crawler1.arun("https://www.example.com") + print("Example 1 result success:", result1.success) + + # Option 2: Pass headers directly to `arun()` + crawler2 = AsyncWebCrawler() + result2 = await crawler2.arun( + url="https://www.example.com", + headers={"Accept-Language": "es-ES,es;q=0.9"} + ) + print("Example 2 result success:", result2.success) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Notes** +- Some sites may react differently to certain headers (e.g., `Accept-Language`). +- If you need advanced user-agent randomization or client hints, see [Identity-Based Crawling (Anti-Bot)](./identity-anti-bot.md) or use `UserAgentGenerator`. + +--- + +## 5. Session Persistence & Local Storage + +Crawl4AI can preserve cookies and localStorage so you can continue where you left off—ideal for logging into sites or skipping repeated auth flows. 
+ +### 5.1 `storage_state` + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + storage_dict = { + "cookies": [ + { + "name": "session", + "value": "abcd1234", + "domain": "example.com", + "path": "/", + "expires": 1699999999.0, + "httpOnly": False, + "secure": False, + "sameSite": "None" + } + ], + "origins": [ + { + "origin": "https://example.com", + "localStorage": [ + {"name": "token", "value": "my_auth_token"} + ] + } + ] + } + + # Provide the storage state as a dictionary to start "already logged in" + async with AsyncWebCrawler( + headless=True, + storage_state=storage_dict + ) as crawler: + result = await crawler.arun("https://example.com/protected") + if result.success: + print("Protected page content length:", len(result.html)) + else: + print("Failed to crawl protected page") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### 5.2 Exporting & Reusing State + +You can sign in once, export the browser context, and reuse it later—without re-entering credentials. + +- **`await context.storage_state(path="my_storage.json")`**: Exports cookies, localStorage, etc. to a file. +- Provide `storage_state="my_storage.json"` on subsequent runs to skip the login step. + +**See**: [Detailed session management tutorial](./hooks-custom.md#using-storage_state) or [Explanations → Browser Context & Managed Browser](../../explanations/browser-management.md) for more advanced scenarios (like multi-step logins, or capturing after interactive pages). + +--- + +## Putting It All Together + +Here’s a snippet that combines multiple “advanced” features (proxy, PDF, screenshot, SSL, custom headers, and session reuse) into one run. Normally, you’d tailor each setting to your project’s needs. + +```python +import os, asyncio +from base64 import b64decode +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + # 1. 
Browser config with proxy + headless + browser_cfg = BrowserConfig( + proxy_config={ + "server": "http://proxy.example.com:8080", + "username": "myuser", + "password": "mypass", + }, + headless=True, + ) + + # 2. Crawler config with PDF, screenshot, SSL, custom headers, and ignoring caches + crawler_cfg = CrawlerRunConfig( + pdf=True, + screenshot=True, + fetch_ssl_certificate=True, + cache_mode=CacheMode.BYPASS, + headers={"Accept-Language": "en-US,en;q=0.8"}, + storage_state="my_storage.json", # Reuse session from a previous sign-in + verbose=True, + ) + + # 3. Crawl + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun("https://secure.example.com/protected", config=crawler_cfg) + + if result.success: + print("[OK] Crawled the secure page. Links found:", len(result.links.get("internal", []))) + + # Save PDF & screenshot + if result.pdf: + with open("result.pdf", "wb") as f: + f.write(b64decode(result.pdf)) + if result.screenshot: + with open("result.png", "wb") as f: + f.write(b64decode(result.screenshot)) + + # Check SSL cert + if result.ssl_certificate: + print("SSL Issuer CN:", result.ssl_certificate.issuer.get("CN", "")) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## Conclusion & Next Steps + +You’ve now explored several **advanced** features: + +- **Proxy Usage** +- **PDF & Screenshot** capturing for large or critical pages +- **SSL Certificate** retrieval & exporting +- **Custom Headers** for language or specialized requests +- **Session Persistence** via storage state + +**Where to go next**: + +- **[Hooks & Custom Code](./hooks-custom.md)**: For multi-step interactions (clicking “Load More,” performing logins, etc.) +- **[Identity-Based Crawling & Anti-Bot](./identity-anti-bot.md)**: If you need more sophisticated user simulation or stealth. 
+- **[Reference → BrowserConfig & CrawlerRunConfig](../../reference/configuration.md)**: Detailed param descriptions for everything you’ve seen here and more. + +With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline. + +**Last Updated**: 2024-XX-XX \ No newline at end of file diff --git a/docs/md_v3/tutorials/async-webcrawler-basics.md b/docs/md_v3/tutorials/async-webcrawler-basics.md new file mode 100644 index 00000000..46256eaa --- /dev/null +++ b/docs/md_v3/tutorials/async-webcrawler-basics.md @@ -0,0 +1,218 @@ +Below is a sample Markdown file (`tutorials/async-webcrawler-basics.md`) illustrating how you might teach new users the fundamentals of `AsyncWebCrawler`. This tutorial builds on the **Getting Started** section by introducing key configuration parameters and the structure of the crawl result. Feel free to adjust the code snippets, wording, or format to match your style. + +--- + +# AsyncWebCrawler Basics + +In this tutorial, you’ll learn how to: + +1. Create and configure an `AsyncWebCrawler` instance +2. Understand the `CrawlResult` object returned by `arun()` +3. Use basic `BrowserConfig` and `CrawlerRunConfig` options to tailor your crawl + +> **Prerequisites** +> - You’ve already completed the [Getting Started](./getting-started.md) tutorial (or have equivalent knowledge). +> - You have **Crawl4AI** installed and configured with Playwright. + +--- + +## 1. What is `AsyncWebCrawler`? + +`AsyncWebCrawler` is the central class for running asynchronous crawling operations in Crawl4AI. It manages browser sessions, handles dynamic pages (if needed), and provides you with a structured result object for each crawl. Essentially, it’s your high-level interface for collecting page data. 
+ +```python +from crawl4ai import AsyncWebCrawler + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result) +``` + +--- + +## 2. Creating a Basic `AsyncWebCrawler` Instance + +Below is a simple code snippet showing how to create and use `AsyncWebCrawler`. This goes one step beyond the minimal example you saw in [Getting Started](./getting-started.md). + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai import BrowserConfig, CrawlerRunConfig + +async def main(): + # 1. Set up configuration objects (optional if you want defaults) + browser_config = BrowserConfig( + browser_type="chromium", + headless=True, + verbose=True + ) + crawler_config = CrawlerRunConfig( + page_timeout=30000, # 30 seconds + wait_for_images=True, + verbose=True + ) + + # 2. Initialize AsyncWebCrawler with your chosen browser config + async with AsyncWebCrawler(config=browser_config) as crawler: + # 3. Run a single crawl + url_to_crawl = "https://example.com" + result = await crawler.arun(url=url_to_crawl, config=crawler_config) + + # 4. Inspect the result + if result.success: + print(f"Successfully crawled: {result.url}") + print(f"HTML length: {len(result.html)}") + print(f"Markdown snippet: {result.markdown[:200]}...") + else: + print(f"Failed to crawl {result.url}. Error: {result.error_message}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Key Points + +1. **`BrowserConfig`** is optional, but it’s the place to specify browser-related settings (e.g., `headless`, `browser_type`). +2. **`CrawlerRunConfig`** deals with how you want the crawler to behave for this particular run (timeouts, waiting for images, etc.). +3. **`arun()`** is the main method to crawl a single URL. We’ll see how `arun_many()` works in later tutorials. + +--- + +## 3. Understanding `CrawlResult` + +When you call `arun()`, you get back a `CrawlResult` object containing all the relevant data from that crawl attempt. 
Some common fields include: + +```python +class CrawlResult(BaseModel): + url: str + html: str + success: bool + cleaned_html: Optional[str] = None + media: Dict[str, List[Dict]] = {} + links: Dict[str, List[Dict]] = {} + screenshot: Optional[str] = None # base64-encoded screenshot if requested + pdf: Optional[bytes] = None # binary PDF data if requested + markdown: Optional[Union[str, MarkdownGenerationResult]] = None + markdown_v2: Optional[MarkdownGenerationResult] = None + error_message: Optional[str] = None + # ... plus other fields like status_code, ssl_certificate, extracted_content, etc. +``` + +### Commonly Used Fields + +- **`success`**: `True` if the crawl succeeded, `False` otherwise. +- **`html`**: The raw HTML (or final rendered state if JavaScript was executed). +- **`markdown` / `markdown_v2`**: Contains the automatically generated Markdown representation of the page. +- **`media`**: A dictionary with lists of extracted images, videos, or audio elements. +- **`links`**: A dictionary with lists of “internal” and “external” link objects. +- **`error_message`**: If `success` is `False`, this often contains a description of the error. + +**Example**: + +```python +if result.success: + print("Page Title or snippet of HTML:", result.html[:200]) + if result.markdown: + print("Markdown snippet:", result.markdown[:200]) + print("Links found:", len(result.links.get("internal", [])), "internal links") +else: + print("Error crawling:", result.error_message) +``` + +--- + +## 4. Relevant Basic Parameters + +Below are a few `BrowserConfig` and `CrawlerRunConfig` parameters you might tweak early on. We’ll cover more advanced ones (like proxies, PDF, or screenshots) in later tutorials. 
+ +### 4.1 `BrowserConfig` Essentials + +| Parameter | Description | Default | +|--------------------|-----------------------------------------------------------|----------------| +| `browser_type` | Which browser engine to use: `"chromium"`, `"firefox"`, `"webkit"` | `"chromium"` | +| `headless` | Run the browser with no UI window. If `False`, you see the browser. | `True` | +| `verbose` | Print extra logs for debugging. | `True` | +| `java_script_enabled` | Toggle JavaScript. When `False`, you might speed up loads but lose dynamic content. | `True` | + +### 4.2 `CrawlerRunConfig` Essentials + +| Parameter | Description | Default | +|-----------------------|--------------------------------------------------------------|--------------------| +| `page_timeout` | Maximum time in ms to wait for the page to load or scripts. | `30000` (30s) | +| `wait_for_images` | Wait for images to fully load. Good for accurate rendering. | `True` | +| `css_selector` | Target only certain elements for extraction. | `None` | +| `excluded_tags` | Skip certain HTML tags (like `nav`, `footer`, etc.) | `None` | +| `verbose` | Print logs for debugging. | `True` | + +> **Tip**: Don’t worry if you see lots of parameters. You’ll learn them gradually in later tutorials. + +--- + +## 5. 
Putting It All Together + +Here’s a slightly more in-depth example that shows off a few key config parameters at once: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai import BrowserConfig, CrawlerRunConfig + +async def main(): + browser_cfg = BrowserConfig( + browser_type="chromium", + headless=True, + java_script_enabled=True, + verbose=False + ) + + crawler_cfg = CrawlerRunConfig( + page_timeout=30000, # wait up to 30 seconds + wait_for_images=True, + css_selector=".article-body", # only extract content under this CSS selector + verbose=True + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun("https://news.example.com", config=crawler_cfg) + + if result.success: + print("[OK] Crawled:", result.url) + print("HTML length:", len(result.html)) + print("Extracted Markdown:", result.markdown_v2.raw_markdown[:300]) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Observations**: +- `css_selector=".article-body"` ensures we only focus on the main content region. +- `page_timeout=30000` helps if the site is slow. +- We turned off `verbose` logs for the browser but kept them on for the crawler config. + +--- + +## 6. Next Steps + +- **Smart Crawling Techniques**: Learn to handle iframes, advanced caching, and selective extraction in the [next tutorial](./smart-crawling.md). +- **Hooks & Custom Code**: See how to inject custom logic before and after navigation in a dedicated [Hooks Tutorial](./hooks-custom.md). +- **Reference**: For a complete list of every parameter in `BrowserConfig` and `CrawlerRunConfig`, check out the [Reference section](../../reference/configuration.md). 
+ +--- + +## Summary + +You now know the basics of **AsyncWebCrawler**: +- How to create it with optional browser/crawler configs +- How `arun()` works for single-page crawls +- Where to find your crawled data in `CrawlResult` +- A handful of frequently used configuration parameters + +From here, you can refine your crawler to handle more advanced scenarios, like focusing on specific content or dealing with dynamic elements. Let’s move on to **[Smart Crawling Techniques](./smart-crawling.md)** to learn how to handle iframes, advanced caching, and more. + +--- + +**Last updated**: 2024-XX-XX + +Keep exploring! If you get stuck, remember to check out the [How-To Guides](../../how-to/) for targeted solutions or the [Explanations](../../explanations/) for deeper conceptual background. \ No newline at end of file diff --git a/docs/md_v3/tutorials/docker-quickstart.md b/docs/md_v3/tutorials/docker-quickstart.md new file mode 100644 index 00000000..73070baa --- /dev/null +++ b/docs/md_v3/tutorials/docker-quickstart.md @@ -0,0 +1,271 @@ +# Deploying with Docker (Quickstart) + +> **⚠️ WARNING: Experimental & Legacy** +> Our current Docker solution for Crawl4AI is **not stable** and **will be discontinued** soon. A more robust Docker/Orchestration strategy is in development, with a planned stable release in **2025**. If you choose to use this Docker approach, please proceed cautiously and avoid production deployment without thorough testing. + +Crawl4AI is **open-source** and under **active development**. We appreciate your interest, but strongly recommend you make **informed decisions** if you need a production environment. Expect breaking changes in future versions. + +--- + +## 1. Installation & Environment Setup (Outside Docker) + +Before we jump into Docker usage, here’s a quick reminder of how to install Crawl4AI locally (legacy doc). For **non-Docker** deployments or local dev: + +```bash +# 1. Install the package +pip install crawl4ai +crawl4ai-setup + +# 2. 
Install playwright dependencies (all browsers or specific ones) +playwright install --with-deps +# or +playwright install --with-deps chromium +# or +playwright install --with-deps chrome +``` + +**Testing** your installation: + +```bash +# Visible browser test +python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=False); page = browser.new_page(); page.goto('https://example.com'); input('Press Enter to close...')" +``` + +--- + +## 2. Docker Overview + +This Docker approach allows you to run a **Crawl4AI** service via REST API. You can: + +1. **POST** a request (e.g., URLs, extraction config) +2. **Retrieve** your results from a task-based endpoint + +> **Note**: This Docker solution is **temporary**. We plan a more robust, stable Docker approach in the near future. For now, you can experiment, but do not rely on it for mission-critical production. + +--- + +## 3. Pulling and Running the Image + +### Basic Run + +```bash +docker pull unclecode/crawl4ai:basic +docker run -p 11235:11235 unclecode/crawl4ai:basic +``` + +This starts a container on port `11235`. You can `POST` requests to `http://localhost:11235/crawl`. + +### Using an API Token + +```bash +docker run -p 11235:11235 \ + -e CRAWL4AI_API_TOKEN=your_secret_token \ + unclecode/crawl4ai:basic +``` + +If **`CRAWL4AI_API_TOKEN`** is set, you must include `Authorization: Bearer <your_secret_token>` in your requests. Otherwise, the service is open to anyone. + +--- + +## 4. Docker Compose for Multi-Container Workflows + +You can also use **Docker Compose** to manage multiple services. 
Below is an **experimental** snippet: + +```yaml +version: '3.8' + +services: + crawl4ai: + image: unclecode/crawl4ai:basic + ports: + - "11235:11235" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + # Additional env variables as needed + volumes: + - /dev/shm:/dev/shm +``` + +To run: + +```bash +docker-compose up -d +``` + +And to stop: + +```bash +docker-compose down +``` + +**Troubleshooting**: + +- **Check logs**: `docker-compose logs -f crawl4ai` +- **Remove orphan containers**: `docker-compose down --remove-orphans` +- **Remove networks**: `docker network rm <network_name>` + +--- + +## 5. Making Requests to the Container + +**Base URL**: `http://localhost:11235` + +### Example: Basic Crawl + +```python +import requests + +task_request = { + "urls": "https://example.com", + "priority": 10 +} + +response = requests.post("http://localhost:11235/crawl", json=task_request) +task_id = response.json()["task_id"] + +# Poll for status +status_url = f"http://localhost:11235/task/{task_id}" +status = requests.get(status_url).json() +print(status) +``` + +If you used an API token, do: + +```python +headers = {"Authorization": "Bearer your_secret_token"} +response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json=task_request +) +``` + +--- + +## 6. Docker + New Crawler Config Approach + +### Using `BrowserConfig` & `CrawlerRunConfig` in Requests + +The Docker-based solution can accept **crawler configurations** in the request JSON (legacy doc might show direct parameters, but we want to embed them in `crawler_params` or `extra` to align with the new approach). For example: + +```python +import requests + +request_data = { + "urls": "https://www.nbcnews.com/business", + "crawler_params": { + "headless": True, + "browser_type": "chromium", + "verbose": True, + "page_timeout": 30000, + # ... 
any other BrowserConfig-like fields + }, + "extra": { + "word_count_threshold": 50, + "bypass_cache": True + } +} + +response = requests.post("http://localhost:11235/crawl", json=request_data) +task_id = response.json()["task_id"] +``` + +This is the recommended style if you want to replicate `BrowserConfig` and `CrawlerRunConfig` settings in Docker mode. + +--- + +## 7. Example: JSON Extraction in Docker + +```python +import requests +import json + +# Define a schema for CSS extraction +schema = { + "name": "Coinbase Crypto Prices", + "baseSelector": ".cds-tableRow-t45thuk", + "fields": [ + { + "name": "crypto", + "selector": "td:nth-child(1) h2", + "type": "text" + }, + { + "name": "symbol", + "selector": "td:nth-child(1) p", + "type": "text" + }, + { + "name": "price", + "selector": "td:nth-child(2)", + "type": "text" + } + ] +} + +request_data = { + "urls": "https://www.coinbase.com/explore", + "extraction_config": { + "type": "json_css", + "params": {"schema": schema} + }, + "crawler_params": { + "headless": True, + "verbose": True + } +} + +resp = requests.post("http://localhost:11235/crawl", json=request_data) +task_id = resp.json()["task_id"] + +# Poll for status +status = requests.get(f"http://localhost:11235/task/{task_id}").json() +if status["status"] == "completed": + extracted_content = status["result"]["extracted_content"] + data = json.loads(extracted_content) + print("Extracted:", len(data), "entries") +else: + print("Task still in progress or failed.") +``` + +--- + +## 8. Why This Docker Is Temporary + +**We are building a new, stable approach**: + +- The current Docker container is **experimental** and might break with future releases. +- We plan a stable release in **2025** with a more robust API, versioning, and orchestration. +- If you use this Docker in production, do so at your own risk and be prepared for **breaking changes**. + +**Community**: Because Crawl4AI is open-source, you can track progress or contribute to the new Docker approach. 
Check the [GitHub repository](https://github.com/unclecode/crawl4ai) for roadmaps and updates. + +--- + +## 9. Known Limitations & Next Steps + +1. **Not Production-Ready**: This Docker approach lacks extensive security, logging, or advanced config for large-scale usage. +2. **Ongoing Changes**: Expect API changes. The official stable version is targeted for **2025**. +3. **LLM Integrations**: Docker images are big if you want GPU or multiple model providers. We might unify these in a future build. +4. **Performance**: For concurrency or large crawls, you may need to tune resources (memory, CPU) and watch out for ephemeral storage. +5. **Version Pinning**: If you must deploy, pin your Docker tag to a specific version (e.g., `:basic-0.3.7`) to avoid surprise updates. + +### Next Steps + +- **Watch the Repository**: For announcements on the new Docker architecture. +- **Experiment**: Use this Docker for test or dev environments, but keep an eye out for breakage. +- **Contribute**: If you have ideas or improvements, open a PR or discussion. +- **Check Roadmaps**: See our [GitHub issues](https://github.com/unclecode/crawl4ai/issues) or [Roadmap doc](https://github.com/unclecode/crawl4ai/blob/main/ROADMAP.md) to find upcoming releases. + +--- + +## 10. Summary + +**Deploying with Docker** can simplify running Crawl4AI as a service. However: + +- **This Docker** approach is **legacy** and subject to removal/overhaul. +- For production, please weigh the risks carefully. +- Detailed “new Docker approach” is coming in **2025**. + +We hope this guide helps you do a quick spin-up of Crawl4AI in Docker for **experimental** usage. Stay tuned for the fully-supported version! 
\ No newline at end of file diff --git a/docs/md_v3/tutorials/getting-started.md b/docs/md_v3/tutorials/getting-started.md new file mode 100644 index 00000000..045590cb --- /dev/null +++ b/docs/md_v3/tutorials/getting-started.md @@ -0,0 +1,265 @@ +# Getting Started with Crawl4AI + +Welcome to **Crawl4AI**, an open-source LLM friendly Web Crawler & Scraper. In this tutorial, you’ll: + +1. **Install** Crawl4AI (both via pip and Docker, with notes on platform challenges). +2. Run your **first crawl** using minimal configuration. +3. Generate **Markdown** output (and learn how it’s influenced by content filters). +4. Experiment with a simple **CSS-based extraction** strategy. +5. See a glimpse of **LLM-based extraction** (including open-source and closed-source model options). + +--- + +## 1. Introduction + +Crawl4AI provides: +- An asynchronous crawler, **`AsyncWebCrawler`**. +- Configurable browser and run settings via **`BrowserConfig`** and **`CrawlerRunConfig`**. +- Automatic HTML-to-Markdown conversion via **`DefaultMarkdownGenerator`** (supports additional filters). +- Multiple extraction strategies (LLM-based or “traditional” CSS/XPath-based). + +By the end of this guide, you’ll have installed Crawl4AI, performed a basic crawl, generated Markdown, and tried out two extraction strategies. + +--- + +## 2. Installation + +### 2.1 Python + Playwright + +#### Basic Pip Installation + +```bash +pip install crawl4ai +crawl4ai-setup +playwright install --with-deps +``` + +- **`crawl4ai-setup`** installs and configures Playwright (Chromium by default). + +We cover advanced installation and Docker in the [Installation](#installation) section. + +--- + +## 3. 
Your First Crawl + +Here’s a minimal Python script that creates an **`AsyncWebCrawler`**, fetches a webpage, and prints the first 300 characters of its Markdown output: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown[:300]) # Print first 300 chars + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What’s happening?** +- **`AsyncWebCrawler`** launches a headless browser (Chromium by default). +- It fetches `https://example.com`. +- Crawl4AI automatically converts the HTML into Markdown. + +You now have a simple, working crawl! + +--- + +## 4. Basic Configuration (Light Introduction) + +Crawl4AI’s crawler can be heavily customized using two main classes: + +1. **`BrowserConfig`**: Controls browser behavior (headless or full UI, user agent, JavaScript toggles, etc.). +2. **`CrawlerRunConfig`**: Controls how each crawl runs (caching, extraction, timeouts, hooking, etc.). + +Below is an example with minimal usage: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + browser_conf = BrowserConfig(headless=True) # or False to see the browser + run_conf = CrawlerRunConfig(cache_mode="BYPASS") + + async with AsyncWebCrawler(config=browser_conf) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_conf + ) + print(result.markdown) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +We’ll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling. + +--- + +## 5. Generating Markdown Output + +By default, Crawl4AI automatically generates Markdown from each crawled page. However, the exact output depends on whether you specify a **markdown generator** or **content filter**. 
+ +- **`result.markdown`**: + The direct HTML-to-Markdown conversion. +- **`result.markdown.fit_markdown`**: + The same content after applying any configured **content filter** (e.g., `PruningContentFilter`). + +### Example: Using a Filter with `DefaultMarkdownGenerator` + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +md_generator = DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.4, threshold_type="fixed") +) + +config = CrawlerRunConfig(markdown_generator=md_generator) + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://news.ycombinator.com", config=config) + print("Raw Markdown length:", len(result.markdown.raw_markdown)) + print("Fit Markdown length:", len(result.markdown.fit_markdown)) +``` + +**Note**: The `async with` block above must run inside a coroutine (for example, an `async def main()` started with `asyncio.run(main())`, as in the earlier examples). If you do **not** specify a content filter or markdown generator, you’ll typically see only the raw Markdown. We’ll dive deeper into these strategies in a dedicated **Markdown Generation** tutorial. + +--- + +## 6. Simple Data Extraction (CSS-based) + +Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. 
Below is a minimal CSS-based example: + +```python +import asyncio +import json +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +async def main(): + schema = { + "name": "Example Items", + "baseSelector": "div.item", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} + ] + } + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/items", + config=CrawlerRunConfig( + extraction_strategy=JsonCssExtractionStrategy(schema) + ) + ) + # The JSON output is stored in 'extracted_content' + data = json.loads(result.extracted_content) + print(data) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Why is this helpful?** +- Great for repetitive page structures (e.g., item listings, articles). +- No AI usage or costs. +- The crawler returns a JSON string you can parse or store. + +--- + +## 7. Simple Data Extraction (LLM-based) + +For more complex or irregular pages, a language model can parse text intelligently into a structure you define. 
Crawl4AI supports both **open-source** and **closed-source** providers: + +- **Open-Source Models** (e.g., `ollama/llama3.3`; pass `api_token="no_token"`, since no API key is needed) +- **OpenAI Models** (e.g., `openai/gpt-4`; requires a real `api_token`) +- Or any provider supported by the underlying library + +The example below configures both an **open-source** strategy (no token) and a **closed-source** one: + +```python +import os +import json +import asyncio +from pydantic import BaseModel, Field +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +class PricingInfo(BaseModel): + model_name: str = Field(..., description="Name of the AI model") + input_fee: str = Field(..., description="Fee for input tokens") + output_fee: str = Field(..., description="Fee for output tokens") + +async def main(): + # 1) Open-Source usage: no token required + llm_strategy_open_source = LLMExtractionStrategy( + provider="ollama/llama3.3", # or "any-other-local-model" + api_token="no_token", # for local models, no API key is typically required + schema=PricingInfo.schema(), + extraction_type="schema", + instruction=""" + From this page, extract all AI model pricing details in JSON format. + Each entry should have 'model_name', 'input_fee', and 'output_fee'. + """, + temperature=0 + ) + + # 2) Closed-Source usage: API key for OpenAI, for example + openai_token = os.getenv("OPENAI_API_KEY", "sk-YOUR_API_KEY") + llm_strategy_openai = LLMExtractionStrategy( + provider="openai/gpt-4", + api_token=openai_token, + schema=PricingInfo.schema(), + extraction_type="schema", + instruction=""" + From this page, extract all AI model pricing details in JSON format. + Each entry should have 'model_name', 'input_fee', and 'output_fee'. 
+ """, + temperature=0 + ) + + # We'll demo the open-source approach here + config = CrawlerRunConfig(extraction_strategy=llm_strategy_open_source) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/pricing", + config=config + ) + print("LLM-based extraction JSON:", result.extracted_content) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What’s happening?** +- We define a Pydantic schema (`PricingInfo`) describing the fields we want. +- The LLM extraction strategy uses that schema and your instructions to transform raw text into structured JSON. +- Depending on the **provider** and **api_token**, you can use local models or a remote API. + +--- + +## 8. Next Steps + +Congratulations! You have: +1. Installed Crawl4AI (via pip, with Docker as an option). +2. Performed a simple crawl and printed Markdown. +3. Seen how adding a **markdown generator** + **content filter** can produce “fit” Markdown. +4. Experimented with **CSS-based** extraction for repetitive data. +5. Learned the basics of **LLM-based** extraction (open-source and closed-source). + +If you are ready for more, check out: + +- **Installation**: Learn more on how to install Crawl4AI and set up Playwright. +- **Focus on Configuration**: Learn to customize browser settings, caching modes, advanced timeouts, etc. +- **Markdown Generation Basics**: Dive deeper into content filtering and “fit markdown” usage. +- **Dynamic Pages & Hooks**: Tackle sites with “Load More” buttons, login forms, or JavaScript complexities. +- **Deployment**: Run Crawl4AI in Docker containers and scale across multiple nodes. +- **Explanations & How-To Guides**: Explore browser contexts, identity-based crawling, hooking, performance, and more. + +Crawl4AI is a powerful tool for extracting data and generating Markdown from virtually any website. Enjoy exploring, and we hope you build amazing AI-powered applications with it! 
diff --git a/docs/md_v3/tutorials/getting-warmer.md b/docs/md_v3/tutorials/getting-warmer.md new file mode 100644 index 00000000..b2deb414 --- /dev/null +++ b/docs/md_v3/tutorials/getting-warmer.md @@ -0,0 +1,527 @@ +# Crawl4AI Quick Start Guide: Your All-in-One AI-Ready Web Crawling & AI Integration Solution + +Crawl4AI, the **#1 trending GitHub repository**, streamlines web content extraction into AI-ready formats. Perfect for AI assistants, semantic search engines, or data pipelines, Crawl4AI transforms raw HTML into structured Markdown or JSON effortlessly. Integrate with LLMs, open-source models, or your own retrieval-augmented generation workflows. + +**What Crawl4AI is not:** + +Crawl4AI is not a replacement for traditional web scraping libraries, Selenium, or Playwright. It's not designed as a general-purpose web automation tool. Instead, Crawl4AI has a specific, focused goal: + +- To generate perfect, AI-friendly data (particularly for LLMs) from web content +- To maximize speed and efficiency in data extraction and processing +- To operate at scale, from Raspberry Pi to cloud infrastructures + +Crawl4AI is engineered with a "scale-first" mindset, aiming to handle millions of links while maintaining exceptional performance. It's super efficient and fast, optimized to: + +1. Transform raw web content into structured, LLM-ready formats (Markdown/JSON) +2. Implement intelligent extraction strategies to reduce reliance on costly API calls +3. Provide a streamlined pipeline for AI data preparation and ingestion + +In essence, Crawl4AI bridges the gap between web content and AI systems, focusing on delivering high-quality, processed data rather than offering broad web automation capabilities. 
+ +**Key Links:** + +- **Website:** [https://crawl4ai.com](https://crawl4ai.com) +- **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai) +- **Colab Notebook:** [Try on Google Colab](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) +- **Quickstart Code Example:** [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) +- **Examples Folder:** [Crawl4AI Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) + +--- + +## Table of Contents + +- [Crawl4AI Quick Start Guide: Your All-in-One AI-Ready Web Crawling \& AI Integration Solution](#crawl4ai-quick-start-guide-your-all-in-one-ai-ready-web-crawling--ai-integration-solution) + - [Table of Contents](#table-of-contents) + - [1. Introduction \& Key Concepts](#1-introduction--key-concepts) + - [2. Installation \& Environment Setup](#2-installation--environment-setup) + - [Test Your Installation](#test-your-installation) + - [3. Core Concepts \& Configuration](#3-core-concepts--configuration) + - [4. Basic Crawling \& Simple Extraction](#4-basic-crawling--simple-extraction) + - [5. Markdown Generation \& AI-Optimized Output](#5-markdown-generation--ai-optimized-output) + - [6. Structured Data Extraction (CSS, XPath, LLM)](#6-structured-data-extraction-css-xpath-llm) + - [7. Advanced Extraction: LLM \& Open-Source Models](#7-advanced-extraction-llm--open-source-models) + - [8. Page Interactions, JS Execution, \& Dynamic Content](#8-page-interactions-js-execution--dynamic-content) + - [9. Media, Links, \& Metadata Handling](#9-media-links--metadata-handling) + - [10. Authentication \& Identity Preservation](#10-authentication--identity-preservation) + - [Manual Setup via User Data Directory](#manual-setup-via-user-data-directory) + - [Using `storage_state`](#using-storage_state) + - [11. Proxy \& Security Enhancements](#11-proxy--security-enhancements) + - [12. 
Screenshots, PDFs \& File Downloads](#12-screenshots-pdfs--file-downloads) + - [13. Caching \& Performance Optimization](#13-caching--performance-optimization) + - [14. Hooks for Custom Logic](#14-hooks-for-custom-logic) + - [15. Dockerization \& Scaling](#15-dockerization--scaling) + - [16. Troubleshooting \& Common Pitfalls](#16-troubleshooting--common-pitfalls) + - [17. Comprehensive End-to-End Example](#17-comprehensive-end-to-end-example) + - [18. Further Resources \& Community](#18-further-resources--community) + +--- + +## 1. Introduction & Key Concepts + +Crawl4AI transforms websites into structured, AI-friendly data. It efficiently handles large-scale crawling, integrates with both proprietary and open-source LLMs, and optimizes content for semantic search or RAG pipelines. + +**Quick Test:** + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def test_run(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown) + +asyncio.run(test_run()) +``` + +If you see Markdown output, everything is working! + +**More info:** [See /docs/introduction](#) or [1_introduction.ex.md](https://github.com/unclecode/crawl4ai/blob/main/introduction.ex.md) + +--- + +## 2. Installation & Environment Setup + +```bash +# Install the package +pip install crawl4ai +crawl4ai-setup + +# Install Playwright with system dependencies (recommended) +playwright install --with-deps # Installs all browsers + +# Or install specific browsers: +playwright install --with-deps chrome # Recommended for Colab/Linux +playwright install --with-deps firefox +playwright install --with-deps webkit +playwright install --with-deps chromium + +# Keep Playwright updated periodically +playwright install +``` + +> **Note**: For Google Colab and some Linux environments, use `chrome` instead of `chromium` - it tends to work more reliably. 
+ +### Test Your Installation +Try these one-liners: + +```bash +# Visible browser test +python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=False); page = browser.new_page(); page.goto('https://example.com'); input('Press Enter to close...')" + +# Headless test (for servers/CI) +python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=True); page = browser.new_page(); page.goto('https://example.com'); print(f'Title: {page.title()}'); browser.close()" +``` + +In the visible test, you should see a browser window load example.com; the headless test prints the page title instead. If you get errors, try Firefox after installing it with `playwright install --with-deps firefox`. + + +**Try in Colab:** +[Open Colab Notebook](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing) + +**More info:** [See /docs/configuration](#) or [2_configuration.md](https://github.com/unclecode/crawl4ai/blob/main/configuration.md) + +--- + +## 3. Core Concepts & Configuration + +Use `AsyncWebCrawler`, `CrawlerRunConfig`, and `BrowserConfig` to control crawling. 
+ +**Example config:** + +```python +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +browser_config = BrowserConfig( + headless=True, + verbose=True, + viewport_width=1080, + viewport_height=600, + text_mode=False, + ignore_https_errors=True, + java_script_enabled=True +) + +run_config = CrawlerRunConfig( + css_selector="article.main", + word_count_threshold=50, + excluded_tags=['nav','footer'], + exclude_external_links=True, + wait_for="css:.article-loaded", + page_timeout=60000, + delay_before_return_html=1.0, + mean_delay=0.1, + max_range=0.3, + process_iframes=True, + remove_overlay_elements=True, + js_code=""" + (async () => { + window.scrollTo(0, document.body.scrollHeight); + await new Promise(r => setTimeout(r, 2000)); + document.querySelector('.load-more')?.click(); + })(); + """ +) + +# Use: ENABLED, DISABLED, BYPASS, READ_ONLY, WRITE_ONLY +# run_config.cache_mode = CacheMode.ENABLED +``` + +**Prefixes:** + +- `http://` or `https://` for live pages +- `file://local.html` for local +- `raw:` for raw HTML strings + +**More info:** [See /docs/async_webcrawler](#) or [3_async_webcrawler.ex.md](https://github.com/unclecode/crawl4ai/blob/main/async_webcrawler.ex.md) + +--- + +## 4. Basic Crawling & Simple Extraction + +```python +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://news.example.com/article", config=run_config) + print(result.markdown) # Basic markdown content +``` + +**More info:** [See /docs/browser_context_page](#) or [4_browser_context_page.ex.md](https://github.com/unclecode/crawl4ai/blob/main/browser_context_page.ex.md) + +--- + +## 5. 
Markdown Generation & AI-Optimized Output + +After crawling, `result.markdown_v2` provides: + +- `raw_markdown`: Unfiltered markdown +- `markdown_with_citations`: Links as references at the bottom +- `references_markdown`: A separate list of reference links +- `fit_markdown`: Filtered, relevant markdown (e.g., after BM25) +- `fit_html`: The HTML used to produce `fit_markdown` + +**Example:** + +```python +print("RAW:", result.markdown_v2.raw_markdown[:200]) +print("CITED:", result.markdown_v2.markdown_with_citations[:200]) +print("REFERENCES:", result.markdown_v2.references_markdown) +print("FIT MARKDOWN:", result.markdown_v2.fit_markdown) +``` + +For AI training, `fit_markdown` focuses on the most relevant content. + +**More info:** [See /docs/markdown_generation](#) or [5_markdown_generation.ex.md](https://github.com/unclecode/crawl4ai/blob/main/markdown_generation.ex.md) + +--- + +## 6. Structured Data Extraction (CSS, XPath, LLM) + +Extract JSON data without LLMs: + +**CSS:** + +```python +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +schema = { + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"} + ] +} +run_config.extraction_strategy = JsonCssExtractionStrategy(schema) +``` + +**XPath:** + +```python +from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy + +xpath_schema = { + "name": "Articles", + "baseSelector": "//div[@class='article']", + "fields": [ + {"name":"headline","selector":".//h1","type":"text"}, + {"name":"summary","selector":".//p[@class='summary']","type":"text"} + ] +} +run_config.extraction_strategy = JsonXPathExtractionStrategy(xpath_schema) +``` + +**More info:** [See /docs/extraction_strategies](#) or [7_extraction_strategies.ex.md](https://github.com/unclecode/crawl4ai/blob/main/extraction_strategies.ex.md) + +--- + +## 7. 
Advanced Extraction: LLM & Open-Source Models + +Use LLMExtractionStrategy for complex tasks. Works with OpenAI or open-source models (e.g., Ollama). + +```python +from pydantic import BaseModel +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +class TravelData(BaseModel): + destination: str + attractions: list + +run_config.extraction_strategy = LLMExtractionStrategy( + provider="ollama/nemotron", + schema=TravelData.schema(), + instruction="Extract destination and top attractions." +) +``` + +**More info:** [See /docs/extraction_strategies](#) or [7_extraction_strategies.ex.md](https://github.com/unclecode/crawl4ai/blob/main/extraction_strategies.ex.md) + +--- + +## 8. Page Interactions, JS Execution, & Dynamic Content + +Insert `js_code` and use `wait_for` to ensure content loads. Example: + +```python +run_config.js_code = """ +(async () => { + document.querySelector('.load-more')?.click(); + await new Promise(r => setTimeout(r, 2000)); +})(); +""" +run_config.wait_for = "css:.item-loaded" +``` + +**More info:** [See /docs/page_interaction](#) or [11_page_interaction.md](https://github.com/unclecode/crawl4ai/blob/main/page_interaction.md) + +--- + +## 9. Media, Links, & Metadata Handling + +`result.media["images"]`: List of images with `src`, `score`, `alt`. Score indicates relevance. + +`result.media["videos"]`, `result.media["audios"]` similarly hold media info. + +`result.links["internal"]`, `result.links["external"]`, `result.links["social"]`: Categorized links. Each link has `href`, `text`, `context`, `type`. + +`result.metadata`: Title, description, keywords, author. 
+ +**Example:** + +```python +# Images +for img in result.media["images"]: + print("Image:", img["src"], "Score:", img["score"], "Alt:", img.get("alt","N/A")) + +# Links +for link in result.links["external"]: + print("External Link:", link["href"], "Text:", link["text"]) + +# Metadata +print("Page Title:", result.metadata["title"]) +print("Description:", result.metadata["description"]) +``` + +**More info:** [See /docs/content_selection](#) or [8_content_selection.ex.md](https://github.com/unclecode/crawl4ai/blob/main/content_selection.ex.md) + +--- + +## 10. Authentication & Identity Preservation + +### Manual Setup via User Data Directory + +1. **Open Chrome with a custom user data dir:** + + ```bash + "C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir="C:\MyChromeProfile" + ``` + + On macOS: + + ```bash + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --user-data-dir="/Users/username/ChromeProfiles/MyProfile" + ``` + +2. **Log in to sites, solve CAPTCHAs, adjust settings manually.** + The browser saves cookies/localStorage in that directory. + +3. **Use `user_data_dir` in `BrowserConfig`:** + + ```python + browser_config = BrowserConfig( + headless=True, + user_data_dir="/Users/username/ChromeProfiles/MyProfile" + ) + ``` + + Now the crawler starts with those cookies, sessions, etc. + +### Using `storage_state` + +Alternatively, export and reuse storage states: + +```python +browser_config = BrowserConfig( + headless=True, + storage_state="mystate.json" # Pre-saved state +) +``` + +No repeated logins needed. + +**More info:** [See /docs/storage_state](#) or [16_storage_state.md](https://github.com/unclecode/crawl4ai/blob/main/storage_state.md) + +--- + +## 11. 
Proxy & Security Enhancements + +Use `proxy_config` for authenticated proxies: + +```python +browser_config.proxy_config = { + "server": "http://proxy.example.com:8080", + "username": "proxyuser", + "password": "proxypass" +} +``` + +Combine with `headers` or `ignore_https_errors` as needed. + +**More info:** [See /docs/proxy_security](#) or [14_proxy_security.md](https://github.com/unclecode/crawl4ai/blob/main/proxy_security.md) + +--- + +## 12. Screenshots, PDFs & File Downloads + +Enable `screenshot=True` or `pdf=True` in `CrawlerRunConfig`: + +```python +run_config.screenshot = True +run_config.pdf = True +``` + +After crawling: + +```python +if result.screenshot: + with open("page.png", "wb") as f: + f.write(result.screenshot) + +if result.pdf: + with open("page.pdf", "wb") as f: + f.write(result.pdf) +``` + +**File Downloads:** + +```python +browser_config.accept_downloads = True +browser_config.downloads_path = "./downloads" +run_config.js_code = """document.querySelector('a.download')?.click();""" + +# After crawl: +print("Downloaded files:", result.downloaded_files) +``` + +**More info:** [See /docs/screenshot_and_pdf_export](#) or [15_screenshot_and_pdf_export.md](https://github.com/unclecode/crawl4ai/blob/main/screenshot_and_pdf_export.md) +Also [10_file_download.md](https://github.com/unclecode/crawl4ai/blob/main/file_download.md) + +--- + +## 13. Caching & Performance Optimization + +Set `cache_mode` to reuse fetch results: + +```python +from crawl4ai import CacheMode +run_config.cache_mode = CacheMode.ENABLED +``` + +Adjust delays, increase concurrency, or use `text_mode=True` for faster extraction. + +**More info:** [See /docs/cache_modes](#) or [9_cache_modes.md](https://github.com/unclecode/crawl4ai/blob/main/cache_modes.md) + +--- + +## 14. Hooks for Custom Logic + +Hooks let you run code at specific lifecycle events without creating pages manually in `on_browser_created`. 
+ +Use `on_page_context_created` to apply routing or modify page contexts before crawling the URL: + +**Example Hook:** + +```python +async def on_page_context_created_hook(context, page, **kwargs): + # Block all images to speed up load + await context.route("**/*.{png,jpg,jpeg}", lambda route: route.abort()) + print("[HOOK] Image requests blocked") + +async with AsyncWebCrawler(config=browser_config) as crawler: + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created_hook) + result = await crawler.arun("https://imageheavy.example.com", config=run_config) + print("Crawl finished with images blocked.") +``` + +This hook is clean and doesn’t create a separate page itself—it just modifies the current context/page setup. + +**More info:** [See /docs/hooks_auth](#) or [13_hooks_auth.md](https://github.com/unclecode/crawl4ai/blob/main/hooks_auth.md) + +--- + +## 15. Dockerization & Scaling + +Use Docker images: + +- AMD64 basic: + +```bash +docker pull unclecode/crawl4ai:basic-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64 +``` + +- ARM64 for M1/M2: + +```bash +docker pull unclecode/crawl4ai:basic-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64 +``` + +- GPU support: + +```bash +docker pull unclecode/crawl4ai:gpu-amd64 +docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu-amd64 +``` + +Scale with load balancers or Kubernetes. + +**More info:** [See /docs/proxy_security (for proxy) or relevant Docker instructions in README](#) + +--- + +## 16. Troubleshooting & Common Pitfalls + +- Empty results? Relax filters, check selectors. +- Timeouts? Increase `page_timeout` or refine `wait_for`. +- CAPTCHAs? Use `user_data_dir` or `storage_state` after manual solving. +- JS errors? Try headful mode for debugging. 
+ +Check [examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) & [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for more code. + +--- + +## 17. Comprehensive End-to-End Example + +Combine hooks, JS execution, PDF saving, LLM extraction—see [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for a full example. + +--- + +## 18. Further Resources & Community + +- **Docs:** [https://crawl4ai.com](https://crawl4ai.com) +- **Issues & PRs:** [https://github.com/unclecode/crawl4ai/issues](https://github.com/unclecode/crawl4ai/issues) + +Follow [@unclecode](https://x.com/unclecode) for news & community updates. + +**Happy Crawling!** +Leverage Crawl4AI to feed your AI models with clean, structured web data today. diff --git a/docs/md_v3/tutorials/hooks-custom.md b/docs/md_v3/tutorials/hooks-custom.md new file mode 100644 index 00000000..2f144065 --- /dev/null +++ b/docs/md_v3/tutorials/hooks-custom.md @@ -0,0 +1,335 @@ +# Hooks & Custom Code + +Crawl4AI supports a **hook** system that lets you run your own Python code at specific points in the crawling pipeline. By injecting logic into these hooks, you can automate tasks like: + +- **Authentication** (log in before navigating) +- **Content manipulation** (modify HTML, inject scripts, etc.) +- **Session or browser configuration** (e.g., adjusting user agents, local storage) +- **Custom data collection** (scrape extra details or track state at each stage) + +In this tutorial, you’ll learn about: + +1. What hooks are available +2. How to attach code to each hook +3. Practical examples (auth flows, user agent changes, content manipulation, etc.) + +> **Prerequisites** +> - Familiar with [AsyncWebCrawler Basics](./async-webcrawler-basics.md). +> - Comfortable with Python async/await. + +--- + +## 1. 
Overview of Available Hooks + +| Hook Name | Called When / Purpose | Context / Objects Provided | +|--------------------------|-----------------------------------------------------------------|-----------------------------------------------------| +| **`on_browser_created`** | Immediately after the browser is launched, but **before** any page or context is created. | **Browser** object only (no `page` yet). Use it for broad browser-level config. | +| **`on_page_context_created`** | Right after a new page context is created. Perfect for setting default timeouts, injecting scripts, etc. | Typically provides `page` and `context`. | +| **`on_user_agent_updated`** | Whenever the user agent changes. For advanced user agent logic or additional header updates. | Typically provides `page` and updated user agent string. | +| **`on_execution_started`** | Right before your main crawling logic runs (before rendering the page). Good for one-time setup or variable initialization. | Typically provides `page`, possibly `context`. | +| **`before_goto`** | Right before navigating to the URL (i.e., `page.goto(...)`). Great for setting cookies, altering the URL, or hooking in authentication steps. | Typically provides `page`, `context`, and `goto_params`. | +| **`after_goto`** | Immediately after navigation completes, but before scraping. For post-login checks or initial content adjustments. | Typically provides `page`, `context`, `response`. | +| **`before_retrieve_html`** | Right before retrieving or finalizing the page’s HTML content. Good for in-page manipulation (e.g., removing ads or disclaimers). | Typically provides `page` or final HTML reference. | +| **`before_return_html`** | Just before the HTML is returned to the crawler pipeline. Last chance to alter or sanitize content. | Typically provides final HTML or a `page`. | + +### A Note on `on_browser_created` (the “unbrowser” hook) +- **No `page`** object is available because no page context exists yet. 
You can, however, set up browser-wide properties. +- For example, you might control [CDP sessions][cdp] or advanced browser flags here. + +--- + +## 2. Registering Hooks + +You can attach hooks by calling: + +```python +crawler.crawler_strategy.set_hook("hook_name", your_hook_function) +``` + +or by passing a `hooks` dictionary to `AsyncWebCrawler` or your strategy constructor: + +```python +hooks = { + "before_goto": my_before_goto_hook, + "after_goto": my_after_goto_hook, + # ... etc. +} +async with AsyncWebCrawler(hooks=hooks) as crawler: + ... +``` + +### Hook Signature + +Each hook is a function (async or sync, depending on your usage) that receives **certain parameters**—most often `page`, `context`, or custom arguments relevant to that stage. The library then awaits or calls your hook before continuing. + +--- + +## 3. Real-Life Examples + +Below are concrete scenarios where hooks come in handy. + +--- + +### 3.1 Authentication Before Navigation + +One of the most frequent tasks is logging in or applying authentication **before** the crawler navigates to a URL (so that the user is recognized immediately). + +#### Using `before_goto` + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def before_goto_auth_hook(page, context, goto_params, **kwargs): + """ + Example: Set cookies or localStorage to simulate login. + This hook runs right before page.goto() is called. + """ + # Example: Insert cookie-based auth or local storage data + # (You could also do more complex actions, like fill forms if you already have a 'page' open.) 
+ print("[HOOK] Setting auth data before goto.") + await context.add_cookies([ + { + "name": "session", + "value": "abcd1234", + "domain": "example.com", + "path": "/" + } + ]) + # Optionally manipulate goto_params if needed: + # goto_params["url"] = goto_params["url"] + "?debug=1" + +async def main(): + hooks = { + "before_goto": before_goto_auth_hook + } + + browser_cfg = BrowserConfig(headless=True) + crawler_cfg = CrawlerRunConfig() + + async with AsyncWebCrawler(config=browser_cfg, hooks=hooks) as crawler: + result = await crawler.arun(url="https://example.com/protected", config=crawler_cfg) + if result.success: + print("[OK] Logged in and fetched protected page.") + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Points** +- `before_goto` receives `page`, `context`, `goto_params` so you can add cookies, localStorage, or even change the URL itself. +- If you need to run a real login flow (submitting forms), consider `on_browser_created` or `on_page_context_created` if you want to do it once at the start. + +--- + +### 3.2 Setting Up the Browser in `on_browser_created` + +If you need to do advanced browser-level configuration (e.g., hooking into the Chrome DevTools Protocol, adjusting command-line flags, etc.), you’ll use `on_browser_created`. No `page` is available yet, but you can set up the **browser** instance itself. + +```python +async def on_browser_created_hook(browser, **kwargs): + """ + Runs immediately after the browser is created, before any pages. + 'browser' here is a Playwright Browser object. + """ + print("[HOOK] Browser created. Setting up custom stuff.") + # Possibly connect to DevTools or create an incognito context + # Example (pseudo-code): + # devtools_url = await browser.new_context(devtools=True) + +# Usage: +async with AsyncWebCrawler(hooks={"on_browser_created": on_browser_created_hook}) as crawler: + ... 
+``` + +--- + +### 3.3 Adjusting Page or Context in `on_page_context_created` + +If you’d like to set default timeouts or inject scripts right after a page context is spun up: + +```python +async def on_page_context_created_hook(page, context, **kwargs): + print("[HOOK] Page context created. Setting default timeouts or scripts.") + page.set_default_timeout(20000) # 20 seconds (synchronous in Playwright; not awaited) + # Possibly inject a script or set user locale + +# Usage: +hooks = { + "on_page_context_created": on_page_context_created_hook +} +``` + +--- + +### 3.4 Dynamically Updating User Agents + +`on_user_agent_updated` is fired whenever the strategy updates the user agent. For instance, you might want to set certain cookies or console-log changes for debugging: + +```python +async def on_user_agent_updated_hook(page, context, new_ua, **kwargs): + print(f"[HOOK] User agent updated to {new_ua}") + # Maybe add a custom header based on new UA + await context.set_extra_http_headers({"X-UA-Source": new_ua}) + +hooks = { + "on_user_agent_updated": on_user_agent_updated_hook +} +``` + +--- + +### 3.5 Initializing Stuff with `on_execution_started` + +`on_execution_started` runs before your main crawling logic. It’s a good place for short, one-time setup tasks (like clearing old caches, or storing a timestamp). + +```python +async def on_execution_started_hook(page, context, **kwargs): + print("[HOOK] Execution started. 
Setting a start timestamp or logging.") + context.set_default_navigation_timeout(45000) # 45s if your site is slow + +hooks = { + "on_execution_started": on_execution_started_hook +} +``` + +--- + +### 3.6 Post-Processing with `after_goto` + +After the crawler finishes navigating (i.e., the page has presumably loaded), you can do additional checks or manipulations—like verifying you’re on the right page, or removing interstitials: + +```python +async def after_goto_hook(page, context, response, **kwargs): + """ + Called right after page.goto() finishes, but before the crawler extracts HTML. + """ + if response and response.ok: + print("[HOOK] After goto. Status:", response.status) + # Maybe remove popups or check if we landed on a login failure page. + await page.evaluate("""() => { + const popup = document.querySelector(".annoying-popup"); + if (popup) popup.remove(); + }""") + else: + print("[HOOK] Navigation might have failed, status not ok or no response.") + +hooks = { + "after_goto": after_goto_hook +} +``` + +--- + +### 3.7 Last-Minute Modifications in `before_retrieve_html` or `before_return_html` + +Sometimes you need to tweak the page or raw HTML right before it’s captured. + +```python +async def before_retrieve_html_hook(page, context, **kwargs): + """ + Modify the DOM just before the crawler finalizes the HTML. + """ + print("[HOOK] Removing adverts before capturing HTML.") + await page.evaluate("""() => { + const ads = document.querySelectorAll(".ad-banner"); + ads.forEach(ad => ad.remove()); + }""") + +async def before_return_html_hook(page, context, html, **kwargs): + """ + 'html' is the near-finished HTML string. Return an updated string if you like. 
+ """ + # For example, remove personal data or certain tags from the final text + print("[HOOK] Sanitizing final HTML.") + sanitized_html = html.replace("PersonalInfo:", "[REDACTED]") + return sanitized_html + +hooks = { + "before_retrieve_html": before_retrieve_html_hook, + "before_return_html": before_return_html_hook +} +``` + +**Note**: If you want to make last-second changes in `before_return_html`, you can manipulate the `html` string directly. Return a new string if you want to override. + +--- + +## 4. Putting It All Together + +You can combine multiple hooks in a single run. For instance: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def on_browser_created_hook(browser, **kwargs): + print("[HOOK] Browser is up, no page yet. Good for broad config.") + +async def before_goto_auth_hook(page, context, goto_params, **kwargs): + print("[HOOK] Adding cookies for auth.") + await context.add_cookies([{"name": "session", "value": "abcd1234", "domain": "example.com"}]) + +async def after_goto_log_hook(page, context, response, **kwargs): + if response: + print("[HOOK] after_goto: Status code:", response.status) + +async def main(): + hooks = { + "on_browser_created": on_browser_created_hook, + "before_goto": before_goto_auth_hook, + "after_goto": after_goto_log_hook + } + + browser_cfg = BrowserConfig(headless=True) + crawler_cfg = CrawlerRunConfig(verbose=True) + + async with AsyncWebCrawler(config=browser_cfg, hooks=hooks) as crawler: + result = await crawler.arun("https://example.com/protected", config=crawler_cfg) + if result.success: + print("[OK] Protected page length:", len(result.html)) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +This example: + +1. **`on_browser_created`** sets up the brand-new browser instance. +2. **`before_goto`** ensures you inject an auth cookie before accessing the page. +3. 
**`after_goto`** logs the resulting HTTP status code. + +--- + +## 5. Common Pitfalls & Best Practices + +1. **Hook Order**: If multiple hooks do overlapping tasks (e.g., two `before_goto` hooks), be mindful of conflicts or repeated logic. +2. **Async vs Sync**: Some hooks might be used in a synchronous or asynchronous style. Confirm your function signature. If the crawler expects `async`, define `async def`. +3. **Mutating goto_params**: `goto_params` is a dict that eventually goes to Playwright’s `page.goto()`. Changing the `url` or adding extra fields can be powerful but can also lead to confusion. Document your changes carefully. +4. **Browser vs Page vs Context**: Not all hooks have both `page` and `context`. For example, `on_browser_created` only has access to **`browser`**. +5. **Avoid Overdoing It**: Hooks are powerful but can lead to complexity. If you find yourself writing massive code inside a hook, consider if a separate “how-to” function with a simpler approach might suffice. + +--- + +## Conclusion & Next Steps + +**Hooks** let you bend Crawl4AI to your will: + +- **Authentication** (cookies, localStorage) with `before_goto` +- **Browser-level config** with `on_browser_created` +- **Page or context config** with `on_page_context_created` +- **Content modifications** before capturing HTML (`before_retrieve_html` or `before_return_html`) + +**Where to go next**: + +- **[Identity-Based Crawling & Anti-Bot](./identity-anti-bot.md)**: Combine hooks with advanced user simulation to avoid bot detection. +- **[Reference → AsyncPlaywrightCrawlerStrategy](../../reference/browser-strategies.md)**: Learn more about how hooks are implemented under the hood. +- **[How-To Guides](../../how-to/)**: Check short, specific recipes for tasks like scraping multiple pages with repeated “Load More” clicks. 
+ +With the hook system, you have near-complete control over the browser’s lifecycle—whether it’s setting up environment variables, customizing user agents, or manipulating the HTML. Enjoy the freedom to create sophisticated, fully customized crawling pipelines! + +**Last Updated**: 2024-XX-XX diff --git a/docs/md_v3/tutorials/json-extraction-basic.md b/docs/md_v3/tutorials/json-extraction-basic.md new file mode 100644 index 00000000..1a9b79e6 --- /dev/null +++ b/docs/md_v3/tutorials/json-extraction-basic.md @@ -0,0 +1,395 @@ +# Extracting JSON (No LLM) + +One of Crawl4AI’s **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. By defining a **schema** with CSS or XPath selectors, you can extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM. + +**Why avoid LLM for basic extractions?** + +1. **Faster & Cheaper**: No API calls or GPU overhead. +2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. A well-defined schema is practically carbon-free. +3. **Precise & Repeatable**: CSS/XPath selectors do exactly what you specify. LLM outputs can vary or hallucinate. +4. **Scales Readily**: For thousands of pages, schema-based extraction runs quickly and in parallel. + +Below, we’ll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We’ll also highlight advanced features like **nested fields** and **base element attributes**. + +--- + +## 1. Intro to Schema-Based Extraction + +A schema defines: + +1. A **base selector** that identifies each “container” element on the page (e.g., a product row, a blog post card). +2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.). +3. **Nested** or **list** types for repeated or hierarchical structures. 
+ +For example, if you have a list of products, each one might have a name, price, reviews, and “related products.” This approach is faster and more reliable than an LLM for consistent, structured pages. + +--- + +## 2. Simple Example: Crypto Prices + +Let’s begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don’t** call any LLM: + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +async def extract_crypto_prices(): + # 1. Define a simple extraction schema + schema = { + "name": "Crypto Prices", + "baseSelector": "div.crypto-row", # Repeated elements + "fields": [ + { + "name": "coin_name", + "selector": "h2.coin-name", + "type": "text" + }, + { + "name": "price", + "selector": "span.coin-price", + "type": "text" + } + ] + } + + # 2. Create the extraction strategy + extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) + + # 3. Set up your crawler config (if needed) + config = CrawlerRunConfig( + # e.g., pass js_code or wait_for if the page is dynamic + # wait_for="css:.crypto-row:nth-child(20)" + cache_mode = CacheMode.BYPASS, + extraction_strategy=extraction_strategy, + ) + + async with AsyncWebCrawler(verbose=True) as crawler: + # 4. Run the crawl and extraction + result = await crawler.arun( + url="https://example.com/crypto-prices", + + config=config + ) + + if not result.success: + print("Crawl failed:", result.error_message) + return + + # 5. Parse the extracted JSON + data = json.loads(result.extracted_content) + print(f"Extracted {len(data)} coin entries") + print(json.dumps(data[0], indent=2) if data else "No data found") + +asyncio.run(extract_crypto_prices()) +``` + +**Highlights**: + +- **`baseSelector`**: Tells us where each “item” (crypto row) is. 
+- **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors. +- Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.). + +No LLM is needed, and the performance is **near-instant** for hundreds or thousands of items. + +--- + +### **XPath Example with `raw://` HTML** + +Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We’ll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`. + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy + +async def extract_crypto_prices_xpath(): + # 1. Minimal dummy HTML with some repeating rows + dummy_html = """ + <html> + <body> + <div class='crypto-row'>
+ <h2 class='coin-name'>Bitcoin</h2> + <span class='coin-price'>$28,000</span> + </div>
+ <div class='crypto-row'> + <h2 class='coin-name'>Ethereum</h2> + <span class='coin-price'>$1,800</span> + </div>
+ </body> + </html> + """ + + # 2. Define the JSON schema (XPath version) + schema = { + "name": "Crypto Prices via XPath", + "baseSelector": "//div[@class='crypto-row']", + "fields": [ + { + "name": "coin_name", + "selector": ".//h2[@class='coin-name']", + "type": "text" + }, + { + "name": "price", + "selector": ".//span[@class='coin-price']", + "type": "text" + } + ] + } + + # 3. Place the strategy in the CrawlerRunConfig + config = CrawlerRunConfig( + extraction_strategy=JsonXPathExtractionStrategy(schema, verbose=True) + ) + + # 4. Use raw:// scheme to pass dummy_html directly + raw_url = f"raw://{dummy_html}" + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url=raw_url, + config=config + ) + + if not result.success: + print("Crawl failed:", result.error_message) + return + + data = json.loads(result.extracted_content) + print(f"Extracted {len(data)} coin rows") + if data: + print("First item:", data[0]) + +asyncio.run(extract_crypto_prices_xpath()) +``` + +**Key Points**: + +1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`. +2. **`baseSelector`** and each field’s `"selector"` use **XPath** instead of CSS. +3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing. +4. Everything (including the extraction strategy) is in **`CrawlerRunConfig`**. + +That’s how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`. + +--- + +## 3. Advanced Schema & Nested Structures + +Real sites often have **nested** or repeated data—like categories containing products, which themselves have a list of reviews or features. For that, we can define **nested** or **list** (and even **nested_list**) fields.
+ +### Sample E-Commerce HTML + +We have a **sample e-commerce** HTML file on GitHub (example): +``` +https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html +``` +This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure **without LLM**. + +```python +schema = { + "name": "E-commerce Product Catalog", + "baseSelector": "div.category", + # (1) We can define optional baseFields if we want to extract attributes from the category container + "baseFields": [ + {"name": "data_cat_id", "type": "attribute", "attribute": "data-cat-id"}, + ], + "fields": [ + { + "name": "category_name", + "selector": "h2.category-name", + "type": "text" + }, + { + "name": "products", + "selector": "div.product", + "type": "nested_list", # repeated sub-objects + "fields": [ + { + "name": "name", + "selector": "h3.product-name", + "type": "text" + }, + { + "name": "price", + "selector": "p.product-price", + "type": "text" + }, + { + "name": "details", + "selector": "div.product-details", + "type": "nested", # single sub-object + "fields": [ + {"name": "brand", "selector": "span.brand", "type": "text"}, + {"name": "model", "selector": "span.model", "type": "text"} + ] + }, + { + "name": "features", + "selector": "ul.product-features li", + "type": "list", + "fields": [ + {"name": "feature", "type": "text"} + ] + }, + { + "name": "reviews", + "selector": "div.review", + "type": "nested_list", + "fields": [ + {"name": "reviewer", "selector": "span.reviewer", "type": "text"}, + {"name": "rating", "selector": "span.rating", "type": "text"}, + {"name": "comment", "selector": "p.review-text", "type": "text"} + ] + }, + { + "name": "related_products", + "selector": "ul.related-products li", + "type": "list", + "fields": [ + {"name": "name", "selector": "span.related-name", "type": "text"}, + {"name": "price", 
"selector": "span.related-price", "type": "text"} + ] + } + ] + } + ] +} +``` + +Key Takeaways: + +- **Nested vs. List**: + - **`type: "nested"`** means a **single** sub-object (like `details`). + - **`type: "list"`** means multiple items that are **simple** dictionaries or single text fields. + - **`type: "nested_list"`** means repeated **complex** objects (like `products` or `reviews`). +- **Base Fields**: We can extract **attributes** from the container element via `"baseFields"`. For instance, `"data_cat_id"` might be `data-cat-id="elect123"`. +- **Transforms**: We can also define a `transform` if we want to lower/upper case, strip whitespace, or even run a custom function. + +### Running the Extraction + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import JsonCssExtractionStrategy + +ecommerce_schema = { + # ... the advanced schema from above ... +} + +async def extract_ecommerce_data(): + strategy = JsonCssExtractionStrategy(ecommerce_schema, verbose=True) + + config = CrawlerRunConfig() + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html", + extraction_strategy=strategy, + config=config + ) + + if not result.success: + print("Crawl failed:", result.error_message) + return + + # Parse the JSON output + data = json.loads(result.extracted_content) + print(json.dumps(data, indent=2) if data else "No data found.") + +asyncio.run(extract_ecommerce_data()) +``` + +If all goes well, you get a **structured** JSON array with each “category,” containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM. + +--- + +## 4. Why “No LLM” Is Often Better + +1. **Zero Hallucination**: Schema-based extraction doesn’t guess text. 
It either finds it or not. +2. **Guaranteed Structure**: The same schema yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys. +3. **Speed**: LLM-based extraction can be 10–1000x slower for large-scale crawling. +4. **Scalable**: Adding or updating a field is a matter of adjusting the schema, not re-tuning a model. + +**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema approach first for repeated or consistent data patterns. + +--- + +## 5. Base Element Attributes & Additional Fields + +It’s easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using: + +```json +{ + "name": "href", + "type": "attribute", + "attribute": "href", + "default": null +} +``` + +You can define them in **`baseFields`** (extracted from the main container element) or in each field’s sub-lists. This is especially helpful if you need an item’s link or ID stored in the parent `<div>`. + +--- + +## 6. Putting It All Together: Larger Example + +Consider a blog site. We have a schema that extracts the **URL** from each post card (via `baseFields` with an `"attribute": "href"`), plus the title, date, summary, and author: + +```python +schema = { + "name": "Blog Posts", + "baseSelector": "a.blog-post-card", + "baseFields": [ + {"name": "post_url", "type": "attribute", "attribute": "href"} + ], + "fields": [ + {"name": "title", "selector": "h2.post-title", "type": "text", "default": "No Title"}, + {"name": "date", "selector": "time.post-date", "type": "text", "default": ""}, + {"name": "summary", "selector": "p.post-summary", "type": "text", "default": ""}, + {"name": "author", "selector": "span.post-author", "type": "text", "default": ""} + ] +} +``` + +Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post objects, each with `"post_url"`, `"title"`, `"date"`, `"summary"`, `"author"`. + +--- + +## 7. Tips & Best Practices + +1. **Inspect the DOM** in Chrome DevTools or Firefox’s Inspector to find stable selectors. +2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists. +3. **Test** your schema on partial HTML or a test page before a big crawl. +4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`. +5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it’ll often show warnings. +6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the “parent” item. +7. **Performance**: For large pages, make sure your selectors are as narrow as possible. + +--- + +## 8. Conclusion + +With **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy**), you can build powerful, **LLM-free** pipelines that: + +- Scrape any consistent site for structured data.
+- Support nested objects, repeating lists, or advanced transformations. +- Scale to thousands of pages quickly and reliably. + +**Next Steps**: + +- Explore the [Advanced Usage of JSON Extraction](../../explanations/extraction-chunking.md) for deeper details on schema nesting, transformations, or hooking. +- Combine your extracted JSON with advanced filtering or summarization in a second pass if needed. +- For dynamic pages, combine strategies with `js_code` or infinite scroll hooking to ensure all content is loaded. + +**Remember**: For repeated, structured data, you don’t need to pay for or wait on an LLM. A well-crafted schema plus CSS or XPath gets you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI. + +**Last Updated**: 2024-XX-XX + +--- + +That’s it for **Extracting JSON (No LLM)**! You’ve seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines! \ No newline at end of file diff --git a/docs/md_v3/tutorials/json-extraction-llm.md b/docs/md_v3/tutorials/json-extraction-llm.md new file mode 100644 index 00000000..5b9369d9 --- /dev/null +++ b/docs/md_v3/tutorials/json-extraction-llm.md @@ -0,0 +1,334 @@ +Below is a **draft** of the **Extracting JSON (LLM)** tutorial, illustrating how to use large language models for structured data extraction in Crawl4AI. It highlights key parameters (like chunking, overlap, instruction, schema) and explains how the system remains **provider-agnostic** via LightLLM. Adjust field names or code snippets to match your repository’s specifics. + +--- + +# Extracting JSON (LLM) + +In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. 
For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that: + +1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more). +2. Automatically splits content into chunks (if desired) to handle token limits, then combines results. +3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach. + +**Important**: LLM-based extraction can be slower and costlier than schema-based approaches. If your page data is highly structured, consider using [`JsonCssExtractionStrategy`](./json-extraction-basic.md) or [`JsonXPathExtractionStrategy`](./json-extraction-basic.md) first. But if you need AI to interpret or reorganize content, read on! + +--- + +## 1. Why Use an LLM? + +- **Complex Reasoning**: If the site’s data is unstructured, scattered, or full of natural language context. +- **Semantic Extraction**: Summaries, knowledge graphs, or relational data that require comprehension. +- **Flexible**: You can pass instructions to the model to do more advanced transformations or classification. + +--- + +## 2. Provider-Agnostic via LightLLM + +Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide: + +- **`provider`**: The `<provider>/<model_name>` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.). +- **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it. +- **`api_base`** (optional): If your provider has a custom endpoint. + +This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily. + +--- + +## 3. How LLM Extraction Works + +### 3.1 Flow + +1. **Chunking** (optional): The HTML or markdown is split into smaller segments if it’s very long (based on `chunk_token_threshold`, overlap, etc.). +2.
**Prompt Construction**: For each chunk, the library forms a prompt that includes your **`instruction`** (and possibly schema or examples). +3. **LLM Inference**: Each chunk is sent to the model in parallel or sequentially (depending on your concurrency). +4. **Combining**: The results from each chunk are merged and parsed into JSON. + +### 3.2 `extraction_type` + +- **`"schema"`**: The model tries to return JSON conforming to your Pydantic-based schema. +- **`"block"`**: The model returns freeform text, or smaller JSON structures, which the library collects. + +For structured data, `"schema"` is recommended. You provide `schema=YourPydanticModel.model_json_schema()`. + +--- + +## 4. Key Parameters + +Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`. + +1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`. +2. **`api_token`** (str): The API key or token for that model. May not be needed for local models. +3. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`. +4. **`extraction_type`** (str): `"schema"` or `"block"`. +5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.” +6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM. +7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity. +8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`. +9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include: + - `"markdown"`: The raw markdown (default). 
+ - `"fit_markdown"`: The filtered “fit” markdown if you used a content filter. + - `"html"`: The cleaned or raw HTML. +10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc. +11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known). + +**Example**: + +```python +extraction_strategy = LLMExtractionStrategy( + provider="openai/gpt-4", + api_token="YOUR_OPENAI_KEY", + schema=MyModel.model_json_schema(), + extraction_type="schema", + instruction="Extract a list of items from the text with 'name' and 'price' fields.", + chunk_token_threshold=1200, + overlap_rate=0.1, + apply_chunking=True, + input_format="html", + extra_args={"temperature": 0.1, "max_tokens": 1000}, + verbose=True +) +``` + +--- + +## 5. Putting It in `CrawlerRunConfig` + +**Important**: In Crawl4AI, all strategy definitions should go inside the `CrawlerRunConfig`, not directly as a param in `arun()`. Here’s a full example: + +```python +import os +import asyncio +import json +from pydantic import BaseModel, Field +from typing import List +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +class Product(BaseModel): + name: str + price: str + +async def main(): + # 1. Define the LLM extraction strategy + llm_strategy = LLMExtractionStrategy( + provider="openai/gpt-4o-mini", # e.g. "ollama/llama2" + api_token=os.getenv('OPENAI_API_KEY'), + schema=Product.schema_json(), # Or use model_json_schema() + extraction_type="schema", + instruction="Extract all product objects with 'name' and 'price' from the content.", + chunk_token_threshold=1000, + overlap_rate=0.0, + apply_chunking=True, + input_format="markdown", # or "html", "fit_markdown" + extra_args={"temperature": 0.0, "max_tokens": 800} + ) + + # 2. 
Build the crawler config + crawl_config = CrawlerRunConfig( + extraction_strategy=llm_strategy, + cache_mode=CacheMode.BYPASS + ) + + # 3. Create a browser config if needed + browser_cfg = BrowserConfig(headless=True) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + # 4. Let's say we want to crawl a single page + result = await crawler.arun( + url="https://example.com/products", + config=crawl_config + ) + + if result.success: + # 5. The extracted content is presumably JSON + data = json.loads(result.extracted_content) + print("Extracted items:", data) + + # 6. Show usage stats + llm_strategy.show_usage() # prints token usage + else: + print("Error:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## 6. Chunking Details + +### 6.1 `chunk_token_threshold` + +If your page is large, you might exceed your LLM’s context window. **`chunk_token_threshold`** sets the approximate max tokens per chunk. The library calculates word→token ratio using `word_token_rate` (often ~0.75 by default). If chunking is enabled (`apply_chunking=True`), the text is split into segments. + +### 6.2 `overlap_rate` + +To keep context continuous across chunks, we can overlap them. E.g., `overlap_rate=0.1` means each subsequent chunk includes 10% of the previous chunk’s text. This is helpful if your needed info might straddle chunk boundaries. + +### 6.3 Performance & Parallelism + +By chunking, you can potentially process multiple chunks in parallel (depending on your concurrency settings and the LLM provider). This reduces total time if the site is huge or has many sections. + +--- + +## 7. Input Format + +By default, **LLMExtractionStrategy** uses `input_format="markdown"`, meaning the **crawler’s final markdown** is fed to the LLM. You can change to: + +- **`html`**: The cleaned HTML or raw HTML (depending on your crawler config) goes into the LLM. 
+- **`fit_markdown`**: If you used, for instance, `PruningContentFilter`, the “fit” version of the markdown is used. This can drastically reduce tokens if you trust the filter. +- **`markdown`**: Standard markdown output from the crawler’s `markdown_generator`. + +This setting is crucial: if the LLM instructions rely on HTML tags, pick `"html"`. If you prefer a text-based approach, pick `"markdown"`. + +```python +LLMExtractionStrategy( + # ... + input_format="html", # Instead of "markdown" or "fit_markdown" +) +``` + +--- + +## 8. Token Usage & Show Usage + +To keep track of tokens and cost, each chunk is processed with an LLM call. We record usage in: + +- **`usages`** (list): token usage per chunk or call. +- **`total_usage`**: sum of all chunk calls. +- **`show_usage()`**: prints a usage report (if the provider returns usage data). + +```python +llm_strategy = LLMExtractionStrategy(...) +# ... +llm_strategy.show_usage() +# e.g. “Total usage: 1241 tokens across 2 chunk calls” +``` + +If your model provider doesn’t return usage info, these fields might be partial or empty. + +--- + +## 9. Example: Building a Knowledge Graph + +Below is a snippet combining **`LLMExtractionStrategy`** with a Pydantic schema for a knowledge graph. Notice how we pass an **`instruction`** telling the model what to parse. 
+ +```python +import os +import json +import asyncio +from typing import List +from pydantic import BaseModel, Field +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +class Entity(BaseModel): + name: str + description: str + +class Relationship(BaseModel): + entity1: Entity + entity2: Entity + description: str + relation_type: str + +class KnowledgeGraph(BaseModel): + entities: List[Entity] + relationships: List[Relationship] + +async def main(): + # LLM extraction strategy + llm_strat = LLMExtractionStrategy( + provider="openai/gpt-4", + api_token=os.getenv('OPENAI_API_KEY'), + schema=KnowledgeGraph.schema_json(), + extraction_type="schema", + instruction="Extract entities and relationships from the content. Return valid JSON.", + chunk_token_threshold=1400, + apply_chunking=True, + input_format="html", + extra_args={"temperature": 0.1, "max_tokens": 1500} + ) + + crawl_config = CrawlerRunConfig( + extraction_strategy=llm_strat, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: + # Example page + url = "https://www.nbcnews.com/business" + result = await crawler.arun(url=url, config=crawl_config) + + if result.success: + with open("kb_result.json", "w", encoding="utf-8") as f: + f.write(result.extracted_content) + llm_strat.show_usage() + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Observations**: + +- **`extraction_type="schema"`** ensures we get JSON fitting our `KnowledgeGraph`. +- **`input_format="html"`** means we feed HTML to the model. +- **`instruction`** guides the model to output a structured knowledge graph. + +--- + +## 10. Best Practices & Caveats + +1. **Cost & Latency**: LLM calls can be slow or expensive. Consider chunking or smaller coverage if you only need partial data. +2. 
**Model Token Limits**: If your page + instruction exceed the context window, chunking is essential. +3. **Instruction Engineering**: Well-crafted instructions can drastically improve output reliability. +4. **Schema Strictness**: `"schema"` extraction tries to parse the model output as JSON. If the model returns invalid JSON, partial extraction might happen, or you might get an error. +5. **Parallel vs. Serial**: The library can process multiple chunks in parallel, but you must watch out for rate limits on certain providers. +6. **Check Output**: Sometimes, an LLM might omit fields or produce extraneous text. You may want to post-validate with Pydantic or do additional cleanup. + +--- + +## 11. Conclusion + +**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind: + +- Put your LLM strategy **in `CrawlerRunConfig`**. +- Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees. +- Tweak **`chunk_token_threshold`**, **`overlap_rate`**, and **`apply_chunking`** to handle large content efficiently. +- Monitor token usage with `show_usage()`. + +If your site’s data is consistent or repetitive, consider [`JsonCssExtractionStrategy`](./json-extraction-basic.md) first for speed and simplicity. But if you need an **AI-driven** approach, `LLMExtractionStrategy` offers a flexible, multi-provider solution for extracting structured JSON from any website. + +**Next Steps**: + +1. **Experiment with Different Providers** + - Try switching the `provider` (e.g., `"ollama/llama2"`, `"openai/gpt-4o"`, etc.) to see differences in speed, accuracy, or cost. + - Pass different `extra_args` like `temperature`, `top_p`, and `max_tokens` to fine-tune your results. + +2. 
**Combine With Other Strategies** + - Use [content filters](../../how-to/content-filters.md) like BM25 or Pruning prior to LLM extraction to remove noise and reduce token usage. + - Apply a [CSS or XPath extraction strategy](./json-extraction-basic.md) first for obvious, structured data, then send only the tricky parts to the LLM. + +3. **Performance Tuning** + - If pages are large, tweak `chunk_token_threshold`, `overlap_rate`, or `apply_chunking` to optimize throughput. + - Check the usage logs with `show_usage()` to keep an eye on token consumption and identify potential bottlenecks. + +4. **Validate Outputs** + - If using `extraction_type="schema"`, parse the LLM’s JSON with a Pydantic model for a final validation step. + - Log or handle any parse errors gracefully, especially if the model occasionally returns malformed JSON. + +5. **Explore Hooks & Automation** + - Integrate LLM extraction with [hooks](./hooks-custom.md) for complex pre/post-processing. + - Use a multi-step pipeline: crawl, filter, LLM-extract, then store or index results for further analysis. + +6. **Scale and Deploy** + - Combine your LLM extraction setup with [Docker or other deployment solutions](./docker-quickstart.md) to run at scale. + - Monitor memory usage and concurrency if you call LLMs frequently. + +**Last Updated**: 2024-XX-XX + +--- + +That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling! \ No newline at end of file diff --git a/docs/md_v3/tutorials/link-media-analysis.md b/docs/md_v3/tutorials/link-media-analysis.md new file mode 100644 index 00000000..229fad8d --- /dev/null +++ b/docs/md_v3/tutorials/link-media-analysis.md @@ -0,0 +1,295 @@ +Below is a **draft** of the **“Link & Media Analysis”** tutorial. It demonstrates how to access and filter links, handle domain restrictions, and manage media (especially images) using Crawl4AI’s configuration options. 
Feel free to adjust examples and text to match your exact workflow or preferences. + +--- + +# Link & Media Analysis + +In this tutorial, you’ll learn how to: + +1. Extract links (internal, external) from crawled pages +2. Filter or exclude specific domains (e.g., social media or custom domains) +3. Access and manage media data (especially images) in the crawl result +4. Configure your crawler to exclude or prioritize certain images + +> **Prerequisites** +> - You have completed or are familiar with the [AsyncWebCrawler Basics](./async-webcrawler-basics.md) tutorial. +> - You can run Crawl4AI in your environment (Playwright, Python, etc.). + +--- + +Below is a revised version of the **Link Extraction** and **Media Extraction** sections that includes example data structures showing how links and media items are stored in `CrawlResult`. Feel free to adjust any field names or descriptions to match your actual output. + +--- + +## 1. Link Extraction + +### 1.1 `result.links` + +When you call `arun()` or `arun_many()` on a URL, Crawl4AI automatically extracts links and stores them in the `links` field of `CrawlResult`. By default, the crawler tries to distinguish **internal** links (same domain) from **external** links (different domains). + +**Basic Example**: + +```python +from crawl4ai import AsyncWebCrawler + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://www.example.com") + if result.success: + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Found {len(internal_links)} internal links, {len(external_links)} external links.") + + # Each link is typically a dictionary with fields like: + # { "href": "...", "text": "...", "title": "...", "base_domain": "..." 
} + if internal_links: + print("Sample Internal Link:", internal_links[0]) + else: + print("Crawl failed:", result.error_message) +``` + +**Structure Example**: + +```python +result.links = { + "internal": [ + { + "href": "https://kidocode.com/", + "text": "", + "title": "", + "base_domain": "kidocode.com" + }, + { + "href": "https://kidocode.com/degrees/technology", + "text": "Technology Degree", + "title": "KidoCode Tech Program", + "base_domain": "kidocode.com" + }, + # ... + ], + "external": [ + # possibly other links leading to third-party sites + ] +} +``` + +- **`href`**: The raw hyperlink URL. +- **`text`**: The link text (if any) within the `<a>` tag. +- **`title`**: The `title` attribute of the link (if present). +- **`base_domain`**: The domain extracted from `href`. Helpful for filtering or grouping by domain. + +--- + +## 2. Domain Filtering + +Some websites contain hundreds of third-party or affiliate links. You can filter out certain domains at **crawl time** by configuring the crawler. The most relevant parameters in `CrawlerRunConfig` are: + +- **`exclude_external_links`**: If `True`, discard any link pointing outside the root domain. +- **`exclude_social_media_domains`**: Provide a list of social media platforms (e.g., `["facebook.com", "twitter.com"]`) to exclude from your crawl. +- **`exclude_social_media_links`**: If `True`, automatically skip known social platforms. +- **`exclude_domains`**: Provide a list of custom domains you want to exclude (e.g., `["spammyads.com", "tracker.net"]`). 
+ +### 2.1 Example: Excluding External & Social Media Links + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + crawler_cfg = CrawlerRunConfig( + exclude_external_links=True, # No links outside primary domain + exclude_social_media_links=True # Skip recognized social media domains + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://www.example.com", + config=crawler_cfg + ) + if result.success: + print("[OK] Crawled:", result.url) + print("Internal links count:", len(result.links.get("internal", []))) + print("External links count:", len(result.links.get("external", []))) + # Likely zero external links in this scenario + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### 2.2 Example: Excluding Specific Domains + +If you want to let external links in, but specifically exclude a domain (e.g., `suspiciousads.com`), do this: + +```python +crawler_cfg = CrawlerRunConfig( + exclude_domains=["suspiciousads.com"] +) +``` + +This approach is handy when you still want external links but need to block certain sites you consider spammy. + +--- + +## 3. Media Extraction + +### 3.1 Accessing `result.media` + +By default, Crawl4AI collects images, audio, and video URLs it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`). 
+ +**Basic Example**: + +```python +if result.success: + images_info = result.media.get("images", []) + print(f"Found {len(images_info)} images in total.") + for i, img in enumerate(images_info[:5]): # Inspect just the first 5 + print(f"[Image {i}] URL: {img['src']}") + print(f" Alt text: {img.get('alt', '')}") + print(f" Score: {img.get('score')}") + print(f" Description: {img.get('desc', '')}\n") +``` + +**Structure Example**: + +```python +result.media = { + "images": [ + { + "src": "https://cdn.prod.website-files.com/.../Group%2089.svg", + "alt": "coding school for kids", + "desc": "Trial Class Degrees degrees All Degrees AI Degree Technology ...", + "score": 3, + "type": "image", + "group_id": 0, + "format": None, + "width": None, + "height": None + }, + # ... + ], + "videos": [ + # Similar structure but with video-specific fields + ], + "audio": [ + # Similar structure but with audio-specific fields + ] +} +``` + +Depending on your Crawl4AI version or scraping strategy, these dictionaries can include fields like: + +- **`src`**: The media URL (e.g., image source) +- **`alt`**: The alt text for images (if present) +- **`desc`**: A snippet of nearby text or a short description (optional) +- **`score`**: A heuristic relevance score if you’re using content-scoring features +- **`width`**, **`height`**: If the crawler detects dimensions for the image/video +- **`type`**: Usually `"image"`, `"video"`, or `"audio"` +- **`group_id`**: If you’re grouping related media items, the crawler might assign an ID + +With these details, you can easily filter out or focus on certain images (for instance, ignoring images with very low scores or a different domain), or gather metadata for analytics. 
+ +### 3.2 Excluding External Images + +If you’re dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on: + +```python +crawler_cfg = CrawlerRunConfig( + exclude_external_images=True +) +``` + +This setting attempts to discard images from outside the primary domain, keeping only those from the site you’re crawling. + +### 3.3 Additional Media Config + +- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`. +- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`. +- **`wait_for_images`**: If `True`, attempts to wait until images are fully loaded before final extraction. + +--- + +## 4. Putting It All Together: Link & Media Filtering + +Here’s a combined example demonstrating how to filter out external links, skip certain domains, and exclude external images: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + # Suppose we want to keep only internal links, remove certain domains, + # and discard external images from the final crawl data. + crawler_cfg = CrawlerRunConfig( + exclude_external_links=True, + exclude_domains=["spammyads.com"], + exclude_social_media_links=True, # skip Twitter, Facebook, etc. + exclude_external_images=True, # keep only images from main domain + wait_for_images=True, # ensure images are loaded + verbose=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://www.example.com", config=crawler_cfg) + + if result.success: + print("[OK] Crawled:", result.url) + + # 1. Links + in_links = result.links.get("internal", []) + ext_links = result.links.get("external", []) + print("Internal link count:", len(in_links)) + print("External link count:", len(ext_links)) # should be zero with exclude_external_links=True + + # 2. 
Images + images = result.media.get("images", []) + print("Images found:", len(images)) + + # Let's see a snippet of these images + for i, img in enumerate(images[:3]): + print(f" - {img['src']} (alt={img.get('alt','')}, score={img.get('score','N/A')})") + else: + print("[ERROR] Failed to crawl. Reason:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## 5. Common Pitfalls & Tips + +1. **Conflicting Flags**: + - `exclude_external_links=True` but then also specifying `exclude_social_media_links=True` is typically fine, but understand that the first setting already discards *all* external links. The second becomes somewhat redundant. + - `exclude_external_images=True` but want to keep some external images? Currently no partial domain-based setting for images, so you might need a custom approach or hook logic. + +2. **Relevancy Scores**: + - If your version of Crawl4AI or your scraping strategy includes an `img["score"]`, it’s typically a heuristic based on size, position, or content analysis. Evaluate carefully if you rely on it. + +3. **Performance**: + - Excluding certain domains or external images can speed up your crawl, especially for large, media-heavy pages. + - If you want a “full” link map, do *not* exclude them. Instead, you can post-filter in your own code. + +4. **Social Media Lists**: + - `exclude_social_media_links=True` typically references an internal list of known social domains like Facebook, Twitter, LinkedIn, etc. If you need to add or remove from that list, look for library settings or a local config file (depending on your version). + +--- + +## 6. 
Next Steps + +Now that you understand how to manage **Link & Media Analysis**, you can: + +- Fine-tune which links are stored or discarded in your final results +- Control which images (or other media) appear in `result.media` +- Filter out entire domains or social media platforms to keep your dataset relevant + +**Recommended Follow-Ups**: +- **[Advanced Features (Proxy, PDF, Screenshots)](./advanced-features.md)**: If you want to capture screenshots or save the page as a PDF for archival or debugging. +- **[Hooks & Custom Code](./hooks-custom.md)**: For more specialized logic, such as automated “infinite scroll” or repeated “Load More” button clicks. +- **Reference**: Check out [CrawlerRunConfig Reference](../../reference/configuration.md) for a comprehensive parameter list. + +**Last updated**: 2024-XX-XX + +--- + +**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project. \ No newline at end of file diff --git a/docs/md_v3/tutorials/markdown-basics.md b/docs/md_v3/tutorials/markdown-basics.md new file mode 100644 index 00000000..48498709 --- /dev/null +++ b/docs/md_v3/tutorials/markdown-basics.md @@ -0,0 +1,382 @@ +Below is a **draft** of the **Markdown Generation Basics** tutorial that incorporates your current Crawl4AI design and terminology. It introduces the default markdown generator, explains the concept of content filters (BM25 and Pruning), and covers the `MarkdownGenerationResult` object in a coherent, step-by-step manner. Adjust parameters or naming as needed to align with your actual codebase. + +--- + +# Markdown Generation Basics + +One of Crawl4AI’s core features is generating **clean, structured markdown** from web pages. Originally built to solve the problem of extracting only the “actual” content and discarding boilerplate or noise, Crawl4AI’s markdown system remains one of its biggest draws for AI workflows. 
+ +In this tutorial, you’ll learn: + +1. How to configure the **Default Markdown Generator** +2. How **content filters** (BM25 or Pruning) help you refine markdown and discard junk +3. The difference between raw markdown (`result.markdown`) and filtered markdown (`fit_markdown`) + +> **Prerequisites** +> - You’ve completed or read [AsyncWebCrawler Basics](./async-webcrawler-basics.md) to understand how to run a simple crawl. +> - You know how to configure `CrawlerRunConfig`. + +--- + +## 1. Quick Example + +Here’s a minimal code snippet that uses the **DefaultMarkdownGenerator** with no additional filtering: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def main(): + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator() + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success: + print("Raw Markdown Output:\n") + print(result.markdown) # The unfiltered markdown from the page + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**What’s happening?** +- `CrawlerRunConfig(markdown_generator=DefaultMarkdownGenerator())` instructs Crawl4AI to convert the final HTML into markdown at the end of each crawl. +- The resulting markdown is accessible via `result.markdown`. + +--- + +## 2. How Markdown Generation Works + +### 2.1 HTML-to-Text Conversion (Forked & Modified) + +Under the hood, **DefaultMarkdownGenerator** uses a specialized HTML-to-text approach that: + +- Preserves headings, code blocks, bullet points, etc. +- Removes extraneous tags (scripts, styles) that don’t add meaningful content. +- Can optionally generate references for links or skip them altogether. + +A set of **options** (passed as a dict) allows you to customize precisely how HTML converts to markdown. 
These map to standard html2text-like configuration plus your own enhancements (e.g., ignoring internal links, preserving certain tags verbatim, or adjusting line widths). + +### 2.2 Link Citations & References + +By default, the generator can convert `<a>` elements into `[text][1]` citations, then place the actual links at the bottom of the document. This is handy for research workflows that demand references in a structured manner. + +### 2.3 Optional Content Filters + +Before or after the HTML-to-Markdown step, you can apply a **content filter** (like BM25 or Pruning) to reduce noise and produce a “fit_markdown”—a heavily pruned version focusing on the page’s main text. We’ll cover these filters shortly. + +--- + +## 3. Configuring the Default Markdown Generator + +You can tweak the output by passing an `options` dict to `DefaultMarkdownGenerator`. For example: + +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + # Example: ignore all links, don't escape HTML, and wrap text at 80 characters + md_generator = DefaultMarkdownGenerator( + options={ + "ignore_links": True, + "escape_html": False, + "body_width": 80 + } + ) + + config = CrawlerRunConfig( + markdown_generator=md_generator + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/docs", config=config) + if result.success: + print("Markdown:\n", result.markdown[:500]) # Just a snippet + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) +``` + +Some commonly used `options`: + +- **`ignore_links`** (bool): Whether to remove all hyperlinks in the final markdown. +- **`ignore_images`** (bool): Remove all `![image]()` references. +- **`escape_html`** (bool): Turn HTML entities into text (default is often `True`). +- **`body_width`** (int): Wrap text at N characters. 
`0` or `None` means no wrapping. +- **`skip_internal_links`** (bool): If `True`, omit `#localAnchors` or internal links referencing the same page. +- **`include_sup_sub`** (bool): Attempt to handle `<sup>` / `<sub>` in a more readable way. + +--- + +## 4. Content Filters + +**Content filters** selectively remove or rank sections of text before turning them into Markdown. This is especially helpful if your page has ads, nav bars, or other clutter you don’t want. + +### 4.1 BM25ContentFilter + +If you have a **search query**, BM25 is a good choice: + +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai import CrawlerRunConfig + +bm25_filter = BM25ContentFilter( + user_query="machine learning", + bm25_threshold=1.2, + use_stemming=True +) + +md_generator = DefaultMarkdownGenerator( + content_filter=bm25_filter, + options={"ignore_links": True} +) + +config = CrawlerRunConfig(markdown_generator=md_generator) +``` + +- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query. +- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more. +- **`use_stemming`**: If `True`, variations of words match (e.g., “learn,” “learning,” “learnt”). + +**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results. + +### 4.2 PruningContentFilter + +If you **don’t** have a specific query, or if you just want a robust “junk remover,” use `PruningContentFilter`. It analyzes text density, link density, HTML structure, and known patterns (like “nav,” “footer”) to systematically prune extraneous or repetitive sections. 
+ +```python +from crawl4ai.content_filter_strategy import PruningContentFilter + +prune_filter = PruningContentFilter( + threshold=0.5, + threshold_type="fixed", # or "dynamic" + min_word_threshold=50 +) +``` + +- **`threshold`**: Score boundary. Blocks below this score get removed. +- **`threshold_type`**: + - `"fixed"`: Straight comparison (`score >= threshold` keeps the block). + - `"dynamic"`: The filter adjusts threshold in a data-driven manner. +- **`min_word_threshold`**: Discard blocks under N words as likely too short or unhelpful. + +**When to Use PruningContentFilter** +- You want a broad cleanup without a user query. +- The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction. + +--- + +## 5. Using Fit Markdown + +When a content filter is active, the library produces two forms of markdown inside `result.markdown_v2` or (if using the simplified field) `result.markdown`: + +1. **`raw_markdown`**: The full unfiltered markdown. +2. **`fit_markdown`**: A “fit” version where the filter has removed or trimmed noisy segments. + +**Note**: +- In earlier examples, you may see references to `result.markdown_v2`. Depending on your library version, you might access `result.markdown`, `result.markdown_v2`, or an object named `MarkdownGenerationResult`. The idea is the same: you’ll have a raw version and a filtered (“fit”) version if a filter is used. 
+ +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +async def main(): + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"ignore_links": True} + ) + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://news.example.com/tech", config=config) + if result.success: + print("Raw markdown:\n", result.markdown) + + # If a filter is used, we also have .fit_markdown: + md_object = result.markdown_v2 # or your equivalent + print("Filtered markdown:\n", md_object.fit_markdown) + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +--- + +## 6. The `MarkdownGenerationResult` Object + +If your library stores detailed markdown output in an object like `MarkdownGenerationResult`, you’ll see fields such as: + +- **`raw_markdown`**: The direct HTML-to-markdown transformation (no filtering). +- **`markdown_with_citations`**: A version that moves links to reference-style footnotes. +- **`references_markdown`**: A separate string or section containing the gathered references. +- **`fit_markdown`**: The filtered markdown if you used a content filter. +- **`fit_html`**: The corresponding HTML snippet used to generate `fit_markdown` (helpful for debugging or advanced usage). + +**Example**: + +```python +md_obj = result.markdown_v2 # your library’s naming may vary +print("RAW:\n", md_obj.raw_markdown) +print("CITED:\n", md_obj.markdown_with_citations) +print("REFERENCES:\n", md_obj.references_markdown) +print("FIT:\n", md_obj.fit_markdown) +``` + +**Why Does This Matter?** +- You can supply `raw_markdown` to an LLM if you want the entire text. +- Or feed `fit_markdown` into a vector database to reduce token usage. 
+- `references_markdown` can help you keep track of link provenance. + +--- + +Below is a **revised section** under “Combining Filters (BM25 + Pruning)” that demonstrates how you can run **two** passes of content filtering without re-crawling, by taking the HTML (or text) from a first pass and feeding it into the second filter. It uses real code patterns from the snippet you provided for **BM25ContentFilter**, which directly accepts **HTML** strings (and can also handle plain text with minimal adaptation). + +--- + +## 7. Combining Filters (BM25 + Pruning) in Two Passes + +You might want to **prune out** noisy boilerplate first (with `PruningContentFilter`), and then **rank what’s left** against a user query (with `BM25ContentFilter`). You don’t have to crawl the page twice. Instead: + +1. **First pass**: Apply `PruningContentFilter` directly to the raw HTML from `result.html` (the crawler’s downloaded HTML). +2. **Second pass**: Take the pruned HTML (or text) from step 1, and feed it into `BM25ContentFilter`, focusing on a user query. + +### Two-Pass Example + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter +from bs4 import BeautifulSoup + +async def main(): + # 1. Crawl with minimal or no markdown generator, just get raw HTML + config = CrawlerRunConfig( + # If you only want raw HTML, you can skip passing a markdown_generator + # or provide one but focus on .html in this example + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/tech-article", config=config) + + if not result.success or not result.html: + print("Crawl failed or no HTML content.") + return + + raw_html = result.html + + # 2. 
First pass: PruningContentFilter on raw HTML + pruning_filter = PruningContentFilter(threshold=0.5, min_word_threshold=50) + + # filter_content returns a list of "text chunks" or cleaned HTML sections + pruned_chunks = pruning_filter.filter_content(raw_html) + # This list is basically pruned content blocks, presumably in HTML or text form + + # For demonstration, let's combine these chunks back into a single HTML-like string + # or you could do further processing. It's up to your pipeline design. + pruned_html = "\n".join(pruned_chunks) + + # 3. Second pass: BM25ContentFilter with a user query + bm25_filter = BM25ContentFilter( + user_query="machine learning", + bm25_threshold=1.2, + language="english" + ) + + bm25_chunks = bm25_filter.filter_content(pruned_html) # returns a list of text chunks + + if not bm25_chunks: + print("Nothing matched the BM25 query after pruning.") + return + + # 4. Combine or display final results + final_text = "\n---\n".join(bm25_chunks) + + print("==== PRUNED OUTPUT (first pass) ====") + print(pruned_html[:500], "... (truncated)") # preview + + print("\n==== BM25 OUTPUT (second pass) ====") + print(final_text[:500], "... (truncated)") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### What’s Happening? + +1. **Raw HTML**: We crawl once and store the raw HTML in `result.html`. +2. **PruningContentFilter**: Takes HTML + optional parameters. It extracts blocks of text or partial HTML, removing headings/sections deemed “noise.” It returns a **list of text chunks**. +3. **Combine or Transform**: We join these pruned chunks back into a single HTML-like string. (Alternatively, you could store them in a list for further logic—whatever suits your pipeline.) +4. **BM25ContentFilter**: We feed the pruned string into `BM25ContentFilter` with a user query. 
This second pass further narrows the content to chunks relevant to “machine learning.” + +**No Re-Crawling**: We used `raw_html` from the first pass, so there’s no need to run `arun()` again—**no second network request**. + +### Tips & Variations + +- **Plain Text vs. HTML**: If your pruned output is mostly text, BM25 can still handle it; just keep in mind it expects a valid string input. If you supply partial HTML (like `"<div><p>some text</p></div>"`), it will parse it as HTML. +- **Chaining in a Single Pipeline**: If your code supports it, you can chain multiple filters automatically. Otherwise, manual two-pass filtering (as shown) is straightforward. +- **Adjust Thresholds**: If you see too much or too little text in step one, tweak `threshold=0.5` or `min_word_threshold=50`. Similarly, `bm25_threshold=1.2` can be raised/lowered for more or fewer chunks in step two. + +### One-Pass Combination? + +If your codebase or pipeline design allows applying multiple filters in one pass, you could do so. But often it’s simpler—and more transparent—to run them sequentially, analyzing each step’s result. + +**Bottom Line**: By **manually chaining** your filtering logic in two passes, you get powerful incremental control over the final content. First, remove “global” clutter with Pruning, then refine further with BM25-based query relevance—without incurring a second network crawl. + +--- + +## 8. Common Pitfalls & Tips + +1. **No Markdown Output?** + - Make sure the crawler actually retrieved HTML. If the site is heavily JS-based, you may need to enable dynamic rendering or wait for elements. + - Check if your content filter is too aggressive. Lower thresholds or disable the filter to see if content reappears. + +2. **Performance Considerations** + - Very large pages with multiple filters can be slower. Consider `cache_mode` to avoid re-downloading. + - If your final use case is LLM ingestion, consider summarizing further or chunking big texts. + +3. **Take Advantage of `fit_markdown`** + - Great for RAG pipelines, semantic search, or any scenario where extraneous boilerplate is unwanted. + - Still verify the textual quality—some sites have crucial data in footers or sidebars. + +4. **Adjusting `html2text` Options** + - If you see lots of raw HTML slipping into the text, turn on `escape_html`. + - If code blocks look messy, experiment with `mark_code` or `handle_code_in_pre`. + +--- + +## 9. 
Summary & Next Steps + +In this **Markdown Generation Basics** tutorial, you learned to: + +- Configure the **DefaultMarkdownGenerator** with HTML-to-text options. +- Use **BM25ContentFilter** for query-specific extraction or **PruningContentFilter** for general noise removal. +- Distinguish between raw and filtered markdown (`fit_markdown`). +- Leverage the `MarkdownGenerationResult` object to handle different forms of output (citations, references, etc.). + +**Where to go from here**: + +- **[Extracting JSON (No LLM)](./json-extraction-basic.md)**: If you need structured data instead of markdown, check out the library’s JSON extraction strategies. +- **[Advanced Features](./advanced-features.md)**: Combine markdown generation with proxies, PDF exports, and more. +- **[Explanations → Content Filters vs. Extraction Strategies](../../explanations/extraction-chunking.md)**: Dive deeper into how filters differ from chunking or semantic extraction. + +Now you can produce high-quality Markdown from any website, focusing on exactly the content you need—an essential step for powering AI models, summarization pipelines, or knowledge-base queries. + +**Last Updated**: 2024-XX-XX + +--- + +That’s it for **Markdown Generation Basics**! Enjoy generating clean, noise-free markdown for your LLM workflows, content archives, or research. \ No newline at end of file diff --git a/docs/md_v3/tutorials/targeted-crawling.md b/docs/md_v3/tutorials/targeted-crawling.md new file mode 100644 index 00000000..f5fe2b77 --- /dev/null +++ b/docs/md_v3/tutorials/targeted-crawling.md @@ -0,0 +1,227 @@ +Below is a **draft** of a follow-up tutorial, **“Smart Crawling Techniques,”** building on the **“AsyncWebCrawler Basics”** tutorial. This tutorial focuses on three main points: + +1. **Advanced usage of CSS selectors** (e.g., partial extraction, exclusions) +2. **Handling iframes** (if relevant for your workflow) +3. 
**Waiting for dynamic content** using `wait_for`, including the new `css:` and `js:` prefixes + +Feel free to adjust code snippets, wording, or emphasis to match your library updates or user feedback. + +--- + +# Smart Crawling Techniques + +In the previous tutorial ([AsyncWebCrawler Basics](./async-webcrawler-basics.md)), you learned how to create an `AsyncWebCrawler` instance, run a basic crawl, and inspect the `CrawlResult`. Now it’s time to explore some of the **targeted crawling** features that let you: + +1. Select specific parts of a webpage using CSS selectors +2. Exclude or ignore certain page elements +3. Wait for dynamic content to load using `wait_for` (with `css:` or `js:` rules) +4. (Optionally) Handle iframes if your target site embeds additional content + +> **Prerequisites** +> - You’ve read or completed [AsyncWebCrawler Basics](./async-webcrawler-basics.md). +> - You have a working environment for Crawl4AI (Playwright installed, etc.). + +--- + +## 1. Targeting Specific Elements with CSS Selectors + +### 1.1 Simple CSS Selector Usage + +Let’s say you only need to crawl the main article content of a news page. By setting `css_selector` in `CrawlerRunConfig`, your final HTML or Markdown output focuses on that region. 
For example: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + browser_cfg = BrowserConfig(headless=True) + crawler_cfg = CrawlerRunConfig( + css_selector=".article-body", # Only capture .article-body content + excluded_tags=["nav", "footer"] # Optional: skip big nav & footer sections + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun( + url="https://news.example.com/story/12345", + config=crawler_cfg + ) + if result.success: + print("[OK] Extracted content length:", len(result.cleaned_html)) + else: + print("[ERROR]", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key Parameters**: +- **`css_selector`**: Tells the crawler to focus on `.article-body`. +- **`excluded_tags`**: Tells the crawler to skip specific HTML tags altogether (e.g., `nav` or `footer`). + +**Tip**: For extremely noisy pages, you can refine the exclusions further with `excluded_selector`, which takes a CSS selector; any elements matching it are removed from the final output. + +### 1.2 Excluding Content with `excluded_selector` + +If you want to remove certain sections within `.article-body` (like “related stories” sidebars), set: + +```python +CrawlerRunConfig( + css_selector=".article-body", + excluded_selector=".related-stories, .ads-banner" +) +``` + +This combination grabs the main article content while filtering out sidebars or ads. + +--- + +## 2. Handling Iframes + +Some sites embed extra content via `