diff --git a/.do/app.yaml b/.do/app.yaml new file mode 100644 index 00000000..00d7b781 --- /dev/null +++ b/.do/app.yaml @@ -0,0 +1,19 @@ +alerts: +- rule: DEPLOYMENT_FAILED +- rule: DOMAIN_FAILED +name: crawl4ai +region: nyc +services: +- dockerfile_path: Dockerfile + github: + branch: 0.3.74 + deploy_on_push: true + repo: unclecode/crawl4ai + health_check: + http_path: /health + http_port: 11235 + instance_count: 1 + instance_size_slug: professional-xs + name: web + routes: + - path: / \ No newline at end of file diff --git a/.do/deploy.template.yaml b/.do/deploy.template.yaml new file mode 100644 index 00000000..9a06a366 --- /dev/null +++ b/.do/deploy.template.yaml @@ -0,0 +1,22 @@ +spec: + name: crawl4ai + services: + - name: crawl4ai + git: + branch: 0.3.74 + repo_clone_url: https://github.com/unclecode/crawl4ai.git + dockerfile_path: Dockerfile + http_port: 11235 + instance_count: 1 + instance_size_slug: professional-xs + health_check: + http_path: /health + envs: + - key: INSTALL_TYPE + value: "basic" + - key: PYTHON_VERSION + value: "3.10" + - key: ENABLE_GPU + value: "false" + routes: + - path: / \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4c3e151e..8e96fa82 100644 --- a/.gitignore +++ b/.gitignore @@ -199,6 +199,7 @@ test_env/ **/.DS_Store todo.md +todo_executor.md git_changes.py git_changes.md pypi_build.sh @@ -208,4 +209,8 @@ git_issues.md .tests/ .issues/ .docs/ -.issues/ \ No newline at end of file +.issues/ +.gitboss/ +todo_executor.md +protect-all-except-feature.sh +manage-collab.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 583c7807..8e5cc91a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,255 @@ # Changelog -# CHANGELOG +## [0.3.74] November 17, 2024 + +This changelog details the updates and changes introduced in Crawl4AI version 0.3.74. It's designed to inform developers about new features, modifications to existing components, removals, and other important information. + +### 1. 
File Download Processing + +- Users can now specify download folders using the `downloads_path` parameter in the `AsyncWebCrawler` constructor or the `arun` method. If not specified, downloads are saved to a "downloads" folder within the `.crawl4ai` directory. +- File download tracking is integrated into the `CrawlResult` object. Successfully downloaded files are listed in the `downloaded_files` attribute, providing their paths. +- Added `accept_downloads` parameter to the crawler strategies (defaults to `False`). When set to `True`, you can pass JS code and the `wait_for` parameter to trigger a file download. + +**Example:** + +```python +import asyncio +import os +from pathlib import Path +from crawl4ai import AsyncWebCrawler + +async def download_example(): + downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads") + os.makedirs(downloads_path, exist_ok=True) + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=downloads_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { downloadLink.click(); } + """, + wait_for=5 # To ensure download has started + ) + + if result.downloaded_files: + print("Downloaded files:") + for file in result.downloaded_files: + print(f"- {file}") + +asyncio.run(download_example()) + +``` + +### 2. Refined Content Filtering + +- Introduced the `RelevanceContentFilter` strategy (and its implementation `BM25ContentFilter`) for extracting relevant content from web pages, replacing Fit Markdown and other content cleaning strategies. This new strategy leverages the BM25 algorithm to identify chunks of text relevant to the page's title, description, keywords, or a user-provided query. +- The `fit_markdown` flag in the content scraper is used to filter content based on title, meta description, and keywords. 
+ +**Example:** + +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) + print(result.extracted_content) # Or result.fit_markdown for the markdown version + print(result.fit_html) # HTML containing only the filtered content + +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) +``` + +### 3. Raw HTML and Local File Support + +- Added support for crawling local files and raw HTML content directly. +- Use the `file://` prefix for local file paths. +- Use the `raw:` prefix for raw HTML strings. + +**Example:** + +```python +async def crawl_local_or_raw(crawler, content, content_type): + prefix = "file://" if content_type == "local" else "raw:" + url = f"{prefix}{content}" + result = await crawler.arun(url=url) + if result.success: + print(f"Markdown Content from {content_type.title()} Source:") + print(result.markdown) + +# Example usage with local file and raw HTML +async def main(): + async with AsyncWebCrawler() as crawler: + # Local File + await crawl_local_or_raw( + crawler, os.path.abspath('tests/async/sample_wikipedia.html'), "local" + ) + # Raw HTML + await crawl_local_or_raw(crawler, "<h1>Raw Test</h1><p>This is raw HTML.</p>", "raw") + + +asyncio.run(main()) +``` + +### 4. Browser Management + +- New asynchronous crawler strategy implemented using Playwright. +- `ManagedBrowser` class introduced for improved browser session handling, offering features like persistent browser sessions between requests (using `session_id` parameter) and browser process monitoring. +- Updated to tf-playwright-stealth for enhanced stealth capabilities. +- Added `use_managed_browser`, `use_persistent_context`, and `chrome_channel` parameters to AsyncPlaywrightCrawlerStrategy. + + +**Example:** +```python +async def browser_management_demo(): + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "user-data-dir") + os.makedirs(user_data_dir, exist_ok=True) # Ensure directory exists + async with AsyncWebCrawler( + use_managed_browser=True, + user_data_dir=user_data_dir, + use_persistent_context=True, + verbose=True + ) as crawler: + result1 = await crawler.arun( + url="https://example.com", session_id="my_session" + ) + result2 = await crawler.arun( + url="https://example.com/anotherpage", session_id="my_session" + ) + +asyncio.run(browser_management_demo()) +``` + + +### 5. API Server & Cache Improvements + +- Added CORS support to API server. +- Implemented static file serving. +- Enhanced root redirect functionality. +- Cache database updated to store response headers and downloaded files information. It utilizes a file system approach to manage large content efficiently. +- New, more efficient caching database built using xxhash and file system approach. +- Introduced `CacheMode` enum (`ENABLED`, `DISABLED`, `READ_ONLY`, `WRITE_ONLY`, `BYPASS`) and `always_bypass_cache` parameter in AsyncWebCrawler for fine-grained cache control. This replaces `bypass_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. + + +### 🗑️ Removals + +- Removed deprecated: `crawl4ai/content_cleaning_strategy.py`. 
+- Removed internal class ContentCleaningStrategy +- Removed legacy cache control flags: `bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`, and `always_by_pass_cache`. These have been superseded by `cache_mode`. + + +### ⚙️ Other Changes + +- Moved version file to `crawl4ai/__version__.py`. +- Added `crawl4ai/cache_context.py`. +- Added `crawl4ai/version_manager.py`. +- Added `crawl4ai/migrations.py`. +- Added `crawl4ai-migrate` entry point. +- Added config `NEED_MIGRATION` and `SHOW_DEPRECATION_WARNINGS`. +- API server now requires an API token for authentication, configurable with the `CRAWL4AI_API_TOKEN` environment variable. This enhances API security. +- Added synchronous crawl endpoint `/crawl_sync` for immediate result retrieval, and direct crawl endpoint `/crawl_direct` bypassing the task queue. + + +### ⚠️ Deprecation Notices + +- The synchronous version of `WebCrawler` is being phased out. While still available via `crawl4ai[sync]`, it will eventually be removed. Transition to `AsyncWebCrawler` is strongly recommended. Boolean cache control flags in `arun` are also deprecated, migrate to using the `cache_mode` parameter. See examples in the "New Features" section above for correct usage. + + +### 🐛 Bug Fixes + +- Resolved issue with browser context closing unexpectedly in Docker. This significantly improves stability, particularly within containerized environments. +- Fixed memory leaks associated with incorrect asynchronous cleanup by removing the `__del__` method and ensuring the browser context is closed explicitly using context managers. +- Improved error handling in `WebScrapingStrategy`. More detailed error messages and suggestions for debugging will minimize frustration when running into unexpected issues. +- Fixed issue with incorrect text parsing in specific HTML structures. 
+ + +### Example of migrating to the new CacheMode: + +**Old way:** + +```python +crawler = AsyncWebCrawler(always_by_pass_cache=True) +result = await crawler.arun(url="https://example.com", bypass_cache=True) +``` + +**New way:** + +```python +from crawl4ai import CacheMode + +crawler = AsyncWebCrawler(always_bypass_cache=True) +result = await crawler.arun(url="https://example.com", cache_mode=CacheMode.BYPASS) +``` + + +## [0.3.74] - November 13, 2024 + +1. **File Download Processing** (Nov 14, 2024) + - Added capability for users to specify download folders + - Implemented file download tracking in the `CrawlResult` object + - Created new file: `tests/async/test_async_doanloader.py` + +2. **Content Filtering Improvements** (Nov 14, 2024) + - Introduced Relevance Content Filter as an improvement over Fit Markdown + - Implemented BM25 algorithm for content relevance matching + - Added new file: `crawl4ai/content_filter_strategy.py` + - Removed deprecated: `crawl4ai/content_cleaning_strategy.py` + +3. **Local File and Raw HTML Support** (Nov 13, 2024) + - Added support for processing local files + - Implemented raw HTML input handling in AsyncWebCrawler + - Enhanced `crawl4ai/async_webcrawler.py` with significant performance improvements + +4. **Browser Management Enhancements** (Nov 12, 2024) + - Implemented new async crawler strategy using Playwright + - Introduced ManagedBrowser for better browser session handling + - Added support for persistent browser sessions + - Updated from playwright_stealth to tf-playwright-stealth + +5. 
**API Server Component** + - Added CORS support + - Implemented static file serving + - Enhanced root redirect functionality + + + +## [0.3.731] - November 13, 2024 + +### Added +- Support for raw HTML and local file crawling via URL prefixes ('raw:', 'file://') +- Browser process monitoring for managed browser instances +- Screenshot capability for raw HTML and local file content +- Response headers storage in cache database +- New `fit_markdown` flag for optional markdown generation + +### Changed +- Switched HTML parser from 'html.parser' to 'lxml' for ~4x performance improvement +- Optimized BeautifulSoup text conversion and element selection +- Pre-compiled regular expressions for better performance +- Improved metadata extraction efficiency +- Response headers now stored alongside HTML in cache + +### Removed +- `__del__` method from AsyncPlaywrightCrawlerStrategy to prevent async cleanup issues + +### Fixed +- Issue #256: Added support for crawling raw HTML content +- Issue #253: Implemented file:// protocol handling +- Missing response headers in cached results +- Memory leaks from improper async cleanup + +## [v0.3.731] - 2024-11-13 Changelog for Issue 256 Fix +- Fixed: Browser context unexpectedly closing in Docker environment during crawl operations. +- Removed: __del__ method from AsyncPlaywrightCrawlerStrategy to prevent unreliable asynchronous cleanup, ensuring the browser context is closed explicitly within context managers. +- Added: Monitoring for ManagedBrowser subprocess to detect and log unexpected terminations. +- Updated: Dockerfile configurations to expose debugging port (9222) and allocate additional shared memory for improved browser stability. +- Improved: Error handling and resource cleanup processes for browser lifecycle management within the Docker environment. 
## [v0.3.73] - 2024-11-05 @@ -70,7 +319,7 @@ - Modified database connection management approach - Updated API response structure for better consistency -## Migration Guide +### Migration Guide When upgrading to v0.3.73, be aware of the following changes: 1. Docker Deployment: @@ -92,7 +341,7 @@ When upgrading to v0.3.73, be aware of the following changes: - Follow recommended fixes for any identified problems -## [2024-11-04 - 13:21:42] Comprehensive Update of Crawl4AI Features and Dependencies +## [v0.3.73] - 2024-11-04 This commit introduces several key enhancements, including improved error handling and robust database operations in `async_database.py`, which now features a connection pool and retry logic for better reliability. Updates to the README.md provide clearer instructions and a better user experience with links to documentation sections. The `.gitignore` file has been refined to include additional directories, while the async web crawler now utilizes a managed browser for more efficient crawling. Furthermore, multiple dependency updates and introduction of the `CustomHTML2Text` class enhance text extraction capabilities. 
## [v0.3.73] - 2024-10-24 @@ -180,7 +429,7 @@ This commit introduces several key enhancements, including improved error handli ## [v0.3.72] - 2024-10-20 ### Fixed -- Added support for parsing Base64 encoded images in WebScrappingStrategy +- Added support for parsing Base64 encoded images in WebScrapingStrategy ### Added - Forked and integrated a customized version of the html2text library for more control over Markdown generation @@ -203,7 +452,7 @@ This commit introduces several key enhancements, including improved error handli ### Developer Notes - The customized html2text library is now located within the crawl4ai package - New configuration options are available in the `config.py` file for external content handling -- The `WebScrappingStrategy` class has been updated to accommodate new external content exclusion options +- The `WebScrapingStrategy` class has been updated to accommodate new external content exclusion options ## [v0.3.71] - 2024-10-19 @@ -280,7 +529,7 @@ These updates aim to provide more flexibility in text processing, improve perfor ### Improvements 1. **Better Error Handling**: - - Enhanced error reporting in WebScrappingStrategy with detailed error messages and suggestions. + - Enhanced error reporting in WebScrapingStrategy with detailed error messages and suggestions. - Added console message and error logging for better debugging. 2. **Image Processing Enhancements**: @@ -338,43 +587,43 @@ These updates aim to provide more flexibility in text processing, improve perfor - Allows retrieval of content after a specified delay, useful for dynamically loaded content. - **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. -## Improvements and Optimizations +### Improvements and Optimizations -### 1. AsyncWebCrawler Enhancements +#### 1. AsyncWebCrawler Enhancements - **Flexible Initialization**: Now accepts arbitrary keyword arguments, passed directly to the crawler strategy. - Allows for more customized setups. 
-### 2. Image Processing Optimization -- Enhanced image handling in WebScrappingStrategy. +#### 2. Image Processing Optimization +- Enhanced image handling in WebScrapingStrategy. - Added filtering for small, invisible, or irrelevant images. - Improved image scoring system for better content relevance. - Implemented JavaScript-based image dimension updating for more accurate representation. -### 3. Database Schema Auto-updates +#### 3. Database Schema Auto-updates - Automatic database schema updates ensure compatibility with the latest version. -### 4. Enhanced Error Handling and Logging +#### 4. Enhanced Error Handling and Logging - Improved error messages and logging for easier debugging. -### 5. Content Extraction Refinements +#### 5. Content Extraction Refinements - Refined HTML sanitization process. - Improved handling of base64 encoded images. - Enhanced Markdown conversion process. - Optimized content extraction algorithms. -### 6. Utility Function Enhancements +#### 6. Utility Function Enhancements - `perform_completion_with_backoff` function now supports additional arguments for more customized API calls to LLM providers. -## Bug Fixes +### Bug Fixes - Fixed an issue where image tags were being prematurely removed during content extraction. -## Examples and Documentation +### Examples and Documentation - Updated `quickstart_async.py` with examples of: - Using custom headers in LLM extraction. - Different LLM provider usage (OpenAI, Hugging Face, Ollama). - Custom browser type usage. -## Developer Notes +### Developer Notes - Refactored code for better maintainability, flexibility, and performance. - Enhanced type hinting throughout the codebase for improved development experience. - Expanded error handling for more robust operation. 
diff --git a/Dockerfile b/Dockerfile index 9a921d03..bd71deae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ ARG ENABLE_GPU=false # Platform-specific labels LABEL maintainer="unclecode" -LABEL description="Crawl4AI - Advanced Web Crawler with AI capabilities" +LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" LABEL version="1.0" # Environment setup @@ -62,11 +62,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libatspi2.0-0 \ && rm -rf /var/lib/apt/lists/* -# GPU support if enabled -RUN if [ "$ENABLE_GPU" = "true" ] ; then \ - apt-get update && apt-get install -y --no-install-recommends \ - nvidia-cuda-toolkit \ - && rm -rf /var/lib/apt/lists/* ; \ +# GPU support if enabled and architecture is supported +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ + else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \ fi # Create and set working directory @@ -96,26 +98,32 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ # Install the package RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ - pip install -e ".[all]" && \ + pip install ".[all]" && \ python -m crawl4ai.model_loader ; \ elif [ "$INSTALL_TYPE" = "torch" ] ; then \ - pip install -e ".[torch]" ; \ + pip install ".[torch]" ; \ elif [ "$INSTALL_TYPE" = "transformer" ] ; then \ - pip install -e ".[transformer]" && \ + pip install ".[transformer]" && \ python -m crawl4ai.model_loader ; \ else \ - pip install -e "." ; \ + pip install "." 
; \ fi + # Install MkDocs and required plugins +RUN pip install --no-cache-dir \ + mkdocs \ + mkdocs-material \ + mkdocs-terminal \ + pymdown-extensions + +# Build MkDocs documentation +RUN mkdocs build + # Install Playwright and browsers RUN playwright install -# Health check -HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \ - CMD curl -f http://localhost:8000/health || exit 1 - # Expose port -EXPOSE 8000 +EXPOSE 8000 11235 9222 8080 # Start the FastAPI server CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"] \ No newline at end of file diff --git a/README.md b/README.md index 7f0e5079..fa88a507 100644 --- a/README.md +++ b/README.md @@ -11,21 +11,21 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. 🆓🌐 -## 🌟 Meet the Crawl4AI Assistant: Your Copilot for Crawling +## New in 0.3.74 ✨ -Use the [Crawl4AI GPT Assistant](https://tinyurl.com/crawl4ai-gpt) as your AI-powered copilot! With this assistant, you can: +- 🚀 **Blazing Fast Scraping**: Significantly improved scraping speed. +- 📥 **Download Manager**: Integrated file crawling, downloading, and tracking within `CrawlResult`. +- 📝 **Markdown Strategy**: Flexible system for custom markdown generation and formats. +- 🔗 **LLM-Friendly Citations**: Auto-converts links to numbered citations with reference lists. +- 🔎 **Markdown Filter**: BM25-based content extraction for cleaner, relevant markdown. +- 🖼️ **Image Extraction**: Supports `srcset`, `picture`, and responsive image formats. +- 🗂️ **Local/Raw HTML**: Crawl `file://` paths and raw HTML (`raw:`) directly. +- 🤖 **Browser Control**: Custom browser setups with stealth integration to bypass bots. +- ☁️ **API & Cache Boost**: CORS, static serving, and enhanced filesystem-based caching. +- 🐳 **API Gateway**: Run as an API service with secure token authentication. 
+- 🛠️ **Database Upgrades**: Optimized for larger content sets with faster caching. +- 🐛 **Bug Fixes**: Resolved browser context issues, memory leaks, and improved error handling. -- 🧑‍💻 Generate code for complex crawling and extraction tasks -- 💡 Get tailored support and examples -- 📘 Learn Crawl4AI faster with step-by-step guidance - -## New in 0.3.73 ✨ - -- 🐳 Docker Ready: Full API server with seamless deployment & scaling -- 🎯 Browser Takeover: Use your own browser with cookies & history intact (CDP support) -- 📝 Mockdown+: Enhanced tag preservation & content extraction -- ⚡️ Parallel Power: Supercharged multi-URL crawling performance -- 🌟 And many more exciting updates... ## Try it Now! @@ -113,6 +113,20 @@ cd crawl4ai pip install -e . ``` +## One-Click Deployment 🚀 + +Deploy your own instance of Crawl4AI with one click: + +[![DigitalOcean Referral Badge](https://web-platforms.sfo2.cdn.digitaloceanspaces.com/WWW/Badge%203.svg)](https://www.digitalocean.com/?repo=https://github.com/unclecode/crawl4ai/tree/0.3.74&refcode=a0780f1bdb3d&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge) + +> 💡 **Recommended specs**: 4GB RAM minimum. Select "professional-xs" or higher when deploying for stable operation. + +The deploy will: +- Set up a Docker container with Crawl4AI +- Configure Playwright and all dependencies +- Start the FastAPI server on port 11235 +- Set up health checks and auto-deployment + ### Using Docker 🐳 Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository. 
@@ -127,6 +141,9 @@ docker pull unclecode/crawl4ai:gpu # GPU-enabled version # Run the container docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version + +# To allocate more shared memory for the container +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic ``` #### Option 2: Build from Repository diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0c6a2db4..0ccf13d8 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -1,13 +1,15 @@ # __init__.py -from .async_webcrawler import AsyncWebCrawler +from .async_webcrawler import AsyncWebCrawler, CacheMode + from .models import CrawlResult -from ._version import __version__ +from .__version__ import __version__ # __version__ = "0.3.73" __all__ = [ "AsyncWebCrawler", "CrawlResult", + "CacheMode", ] def is_sync_version_installed(): @@ -26,5 +28,5 @@ if is_sync_version_installed(): print("Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies.") else: WebCrawler = None - import warnings - print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.") \ No newline at end of file + # import warnings + # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support.
However, please note that the synchronous version will be deprecated soon.") \ No newline at end of file diff --git a/crawl4ai/_version.py b/crawl4ai/__version__.py similarity index 51% rename from crawl4ai/_version.py rename to crawl4ai/__version__.py index 85030f0e..65ee6e73 100644 --- a/crawl4ai/_version.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.73" \ No newline at end of file +__version__ = "0.3.74" \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 9af9f826..3f332eb0 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -14,6 +14,7 @@ from pydantic import BaseModel import hashlib import json import uuid +from .models import AsyncCrawlResponse from playwright_stealth import StealthConfig, stealth_async @@ -34,13 +35,15 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless self.browser_process = None self.temp_dir = None self.debugging_port = 9222 + self.logger = logger + self.shutting_down = False async def start(self) -> str: """ @@ -64,12 +67,50 @@ class ManagedBrowser: stdout=subprocess.PIPE, stderr=subprocess.PIPE ) + # Monitor browser process output for errors + asyncio.create_task(self._monitor_browser_process()) await asyncio.sleep(2) # Give browser time to start return f"http://localhost:{self.debugging_port}" except Exception as e: await self.cleanup() raise Exception(f"Failed to start browser: {e}") + async def _monitor_browser_process(self): + """Monitor the browser process for unexpected termination.""" + if self.browser_process: + try: + stdout, stderr = await 
asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + asyncio.to_thread(self.browser_process.stderr.read) + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode() + } + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode} + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)} + ) + def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" if sys.platform == "darwin": # macOS @@ -118,30 +159,40 @@ class ManagedBrowser: async def cleanup(self): """Cleanup browser process and temporary directory""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + if self.browser_process: try: self.browser_process.terminate() - await asyncio.sleep(1) + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running if self.browser_process.poll() is None: self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + except Exception as e: - print(f"Error terminating browser: {e}") + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)} + ) if self.temp_dir and os.path.exists(self.temp_dir): try: shutil.rmtree(self.temp_dir) except Exception as e: - print(f"Error removing temporary directory: 
{e}") + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + params={"error": str(e)} + ) -class AsyncCrawlResponse(BaseModel): - html: str - response_headers: Dict[str, str] - status_code: int - screenshot: Optional[str] = None - get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None - - class Config: - arbitrary_types_allowed = True class AsyncCrawlerStrategy(ABC): @abstractmethod @@ -165,7 +216,8 @@ class AsyncCrawlerStrategy(ABC): pass class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, use_cached_html=False, js_code=None, **kwargs): + def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): + self.logger = logger self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", @@ -177,6 +229,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.headless = kwargs.get("headless", True) self.browser_type = kwargs.get("browser_type", "chromium") self.headers = kwargs.get("headers", {}) + self.cookies = kwargs.get("cookies", []) self.sessions = {} self.session_ttl = 1800 self.js_code = js_code @@ -186,6 +239,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.sleep_on_close = kwargs.get("sleep_on_close", False) self.use_managed_browser = kwargs.get("use_managed_browser", False) self.user_data_dir = kwargs.get("user_data_dir", None) + self.use_persistent_context = kwargs.get("use_persistent_context", False) + self.chrome_channel = kwargs.get("chrome_channel", "chrome") self.managed_browser = None self.default_context = None self.hooks = { @@ -197,6 +252,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'before_return_html': None, 'before_retrieve_html': None } + self.extra_args = kwargs.get("extra_args", []) + self.accept_downloads = kwargs.get("accept_downloads", False) + self.downloads_path = kwargs.get("downloads_path") + self._downloaded_files = [] # Track downloaded files for 
current crawl + if self.accept_downloads and not self.downloads_path: + self.downloads_path = os.path.join(os.getcwd(), "downloads") + os.makedirs(self.downloads_path, exist_ok=True) + async def __aenter__(self): await self.start() @@ -214,7 +277,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.managed_browser = ManagedBrowser( browser_type=self.browser_type, user_data_dir=self.user_data_dir, - headless=self.headless + headless=self.headless, + logger=self.logger ) cdp_url = await self.managed_browser.start() self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) @@ -232,42 +296,90 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Set up the default context if self.default_context: await self.default_context.set_extra_http_headers(self.headers) - + if self.cookies: + await self.default_context.add_cookies(self.cookies) + if self.accept_downloads: + await self.default_context.set_default_timeout(60000) + await self.default_context.set_default_navigation_timeout(60000) + self.default_context._impl_obj._options["accept_downloads"] = True + self.default_context._impl_obj._options["downloads_path"] = self.downloads_path + if self.user_agent: await self.default_context.set_extra_http_headers({ "User-Agent": self.user_agent }) else: + # Base browser arguments browser_args = { "headless": self.headless, "args": [ - "--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage", - "--disable-blink-features=AutomationControlled", + "--no-first-run", + "--no-default-browser-check", "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", "--ignore-certificate-errors-spki-list", - # "--headless=new", # Use the new headless mode ] } + + # Add channel if specified (try Chrome first) + if self.chrome_channel: + browser_args["channel"] = self.chrome_channel + + # Add extra args if provided + if self.extra_args: + browser_args["args"].extend(self.extra_args) + + # Add downloads path if downloads are enabled + if 
self.accept_downloads: + browser_args["downloads_path"] = self.downloads_path # Add proxy settings if a proxy is specified if self.proxy: proxy_settings = ProxySettings(server=self.proxy) browser_args["proxy"] = proxy_settings elif self.proxy_config: - proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password")) + proxy_settings = ProxySettings( + server=self.proxy_config.get("server"), + username=self.proxy_config.get("username"), + password=self.proxy_config.get("password") + ) browser_args["proxy"] = proxy_settings - # Select the appropriate browser based on the browser_type - if self.browser_type == "firefox": - self.browser = await self.playwright.firefox.launch(**browser_args) - elif self.browser_type == "webkit": - self.browser = await self.playwright.webkit.launch(**browser_args) - else: - self.browser = await self.playwright.chromium.launch(**browser_args) + try: + # Select the appropriate browser based on the browser_type + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + if self.use_persistent_context and self.user_data_dir: + self.browser = await self.playwright.chromium.launch_persistent_context( + user_data_dir=self.user_data_dir, + accept_downloads=self.accept_downloads, + downloads_path=self.downloads_path if self.accept_downloads else None, + **browser_args + ) + self.default_context = self.browser + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + + except Exception as e: + # Fallback to chromium if Chrome channel fails + if "chrome" in str(e) and browser_args.get("channel") == "chrome": + browser_args["channel"] = "chromium" + if self.use_persistent_context and self.user_data_dir: + self.browser = await 
self.playwright.chromium.launch_persistent_context( + user_data_dir=self.user_data_dir, + **browser_args + ) + self.default_context = self.browser + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + else: + raise await self.execute_hook('on_browser_created', self.browser) @@ -285,6 +397,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.browser = None if self.managed_browser: + await asyncio.sleep(0.5) await self.managed_browser.cleanup() self.managed_browser = None @@ -292,9 +405,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.playwright.stop() self.playwright = None - def __del__(self): - if self.browser or self.playwright: - asyncio.get_event_loop().run_until_complete(self.close()) + # Issue #256: Remove __del__ method to avoid potential issues with async cleanup + # def __del__(self): + # if self.browser or self.playwright: + # asyncio.get_event_loop().run_until_complete(self.close()) def set_hook(self, hook_type: str, hook: Callable): if hook_type in self.hooks: @@ -431,17 +545,99 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }} """) else: - print(f"Warning: Could not access content frame for iframe {i}") + # print(f"Warning: Could not access content frame for iframe {i}") + self.logger.warning( + message="Could not access content frame for iframe {index}", + tag="SCRAPE", + params={"index": i} + ) except Exception as e: - print(f"Error processing iframe {i}: {str(e)}") + self.logger.error( + message="Error processing iframe {index}: {error}", + tag="ERROR", + params={"index": i, "error": str(e)} + ) + # print(f"Error processing iframe {i}: {str(e)}") # Return the page object return page async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + """ + Crawls a given URL or processes raw HTML/local file content based on the URL prefix. + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. 
+ - 'file://': Local file path to process. + - 'raw:': Raw HTML content to process. + **kwargs: Additional parameters: + - 'screenshot' (bool): Whether to take a screenshot. + - ... [other existing parameters] + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. + """ + response_headers = {} + status_code = 200 # Default to 200 for local/raw HTML + screenshot_requested = kwargs.get('screenshot', False) + screenshot_data = None + + if url.startswith(('http://', 'https://')): + # Proceed with standard web crawling + return await self._crawl_web(url, **kwargs) + + elif url.startswith('file://'): + # Process local file + local_file_path = url[7:] # Remove 'file://' prefix + if not os.path.exists(local_file_path): + raise FileNotFoundError(f"Local file not found: {local_file_path}") + with open(local_file_path, 'r', encoding='utf-8') as f: + html = f.read() + if screenshot_requested: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None + ) + + elif url.startswith('raw:'): + # Process raw HTML content + raw_html = url[4:] # Remove 'raw:' prefix + html = raw_html + if screenshot_requested: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None + ) + else: + raise ValueError("URL must start with 'http://', 'https://', 'file://', or 'raw:'") + + + async def _crawl_web(self, url: str, **kwargs) -> AsyncCrawlResponse: + """ + Existing web crawling logic remains unchanged. + + Args: + url (str): The web URL to crawl. + **kwargs: Additional parameters. + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. 
+ """ response_headers = {} status_code = None + # Reset downloaded files list for new crawl + self._downloaded_files = [] + self._cleanup_expired_sessions() session_id = kwargs.get("session_id") @@ -461,24 +657,41 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if session_id: context, page, _ = self.sessions.get(session_id, (None, None, None)) if not context: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + # In persistent context, browser is the context + context = self.browser + page = await context.new_page() + else: + # Normal context creation for non-persistent or non-Chrome browsers + context = await self.browser.new_context( + user_agent=self.user_agent, + viewport={"width": 1200, "height": 800}, + proxy={"server": self.proxy} if self.proxy else None, + java_script_enabled=True, + accept_downloads=self.accept_downloads, + # downloads_path=self.downloads_path if self.accept_downloads else None + ) + await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + if self.cookies: + await context.add_cookies(self.cookies) + await context.set_extra_http_headers(self.headers) + page = await context.new_page() + self.sessions[session_id] = (context, page, time.time()) + else: + if self.use_persistent_context and self.browser_type in ["chrome", "chromium"]: + # In persistent context, browser is the context + context = self.browser + else: + # Normal context creation context = await self.browser.new_context( user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, proxy={"server": self.proxy} if self.proxy else None, - accept_downloads=True, - java_script_enabled=True + accept_downloads=self.accept_downloads, ) - await context.add_cookies([{"name": "cookiesEnabled", "value": "true", "url": url}]) + if self.cookies: + await context.add_cookies(self.cookies) await context.set_extra_http_headers(self.headers) - page = await context.new_page() - self.sessions[session_id] = (context, 
page, time.time()) - else: - context = await self.browser.new_context( - user_agent=self.user_agent, - viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None - ) - await context.set_extra_http_headers(self.headers) if kwargs.get("override_navigator", False) or kwargs.get("simulate_user", False) or kwargs.get("magic", False): # Inject scripts to override navigator properties @@ -512,7 +725,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """) page = await context.new_page() - # await stealth_async(page) #, stealth_config) + if kwargs.get("magic", False): + await stealth_async(page, stealth_config) # Add console message and error logging if kwargs.get("log_console", False): @@ -520,8 +734,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page.on("pageerror", lambda exc: print(f"Page Error: {exc}")) try: - if self.verbose: - print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") + # Set up download handling if enabled + if self.accept_downloads: + page.on("download", lambda download: asyncio.create_task(self._handle_download(download))) + + # if self.verbose: + # print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...") if self.use_cached_html: cache_file_path = os.path.join( @@ -544,8 +762,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not kwargs.get("js_only", False): await self.execute_hook('before_goto', page) + response = await page.goto( - url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000) + url, + # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), + wait_until=kwargs.get("wait_until", "domcontentloaded"), + timeout=kwargs.get("page_timeout", 60000) ) # response = await page.goto("about:blank") @@ -613,7 +835,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): for js in js_code: await page.evaluate(js) - await page.wait_for_load_state('networkidle') + # await 
page.wait_for_timeout(100) + # Check for on execution event await self.execute_hook('on_execution_started', page) @@ -631,6 +854,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") + + # if not wait_for and js_code: + # await page.wait_for_load_state('networkidle', timeout=5000) # Update image dimensions update_image_dimensions_js = """ @@ -720,9 +946,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await asyncio.sleep(screenshot_wait_for) screenshot_data = await self.take_screenshot(page) - if self.verbose: - print(f"[LOG] ✅ Crawled {url} successfully!") - + # if self.verbose: + # print(f"[LOG] ✅ Crawled {url} successfully!") + if self.use_cached_html: cache_file_path = os.path.join( os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() @@ -747,16 +973,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, - get_delayed_content=get_delayed_content + get_delayed_content=get_delayed_content, + downloaded_files=self._downloaded_files if self._downloaded_files else None ) return response except Error as e: - raise Error(f"[ERROR] 🚫 crawl(): Failed to crawl {url}: {str(e)}") + raise Error(f"async_crawler_strategy.py:_crawleb(): {str(e)}") # finally: # if not session_id: # await page.close() # await context.close() + async def _handle_download(self, download): + """Handle file downloads.""" + try: + suggested_filename = download.suggested_filename + download_path = os.path.join(self.downloads_path, suggested_filename) + + self.logger.info( + message="Downloading {filename} to {path}", + tag="FETCH", + params={"filename": suggested_filename, "path": download_path} + ) + + start_time = time.perf_counter() + await 
download.save_as(download_path) + end_time = time.perf_counter() + self._downloaded_files.append(download_path) + + self.logger.success( + message="Downloaded {filename} successfully", + tag="COMPLETE", + params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} + ) + except Exception as e: + self.logger.error( + message="Failed to handle download: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + # if self.verbose: + # print(f"[ERROR] Failed to handle download: {str(e)}") + async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed semaphore = asyncio.Semaphore(semaphore_count) @@ -898,17 +1157,36 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await page.evaluate(remove_overlays_js) await page.wait_for_timeout(500) # Wait for any animations to complete except Exception as e: - if self.verbose: - print(f"Warning: Failed to remove overlay elements: {str(e)}") + self.logger.warning( + message="Failed to remove overlay elements: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # if self.verbose: + # print(f"Warning: Failed to remove overlay elements: {str(e)}") async def take_screenshot(self, page: Page) -> str: + """ + Takes a screenshot of the current page. 
+ + Args: + page (Page): The Playwright page instance + + Returns: + str: Base64-encoded screenshot image + """ try: # The page is already loaded, just take the screenshot screenshot = await page.screenshot(full_page=True) return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + # Generate an error image img = Image.new('RGB', (800, 600), color='black') @@ -921,4 +1199,41 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(buffered.getvalue()).decode('utf-8') finally: await page.close() + + async def _generate_screenshot_from_html(self, html: str) -> Optional[str]: + """ + Generates a screenshot from raw HTML content. + + Args: + html (str): The HTML content to render and capture. + + Returns: + Optional[str]: Base64-encoded screenshot image or an error image if failed. 
+ """ + try: + if not self.browser: + await self.start() + page = await self.browser.new_page() + await page.set_content(html, wait_until='networkidle') + screenshot = await page.screenshot(full_page=True) + await page.close() + return base64.b64encode(screenshot).decode('utf-8') + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + # print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + + # Generate an error image + img = Image.new('RGB', (800, 600), color='black') + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode('utf-8') diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 249c4b31..3c97e7d1 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -5,28 +5,89 @@ import asyncio from typing import Optional, Tuple, Dict from contextlib import asynccontextmanager import logging - +import json # Added for serialization/deserialization +from .utils import ensure_content_dirs, generate_content_hash +from .models import CrawlResult +import xxhash +import aiofiles +from .config import NEED_MIGRATION +from .version_manager import VersionManager +from .async_logger import AsyncLogger # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") +base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) -DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") +DB_PATH = os.path.join(base_directory, "crawl4ai.db") class AsyncDatabaseManager: def __init__(self, pool_size: int = 10, max_retries: int = 3): self.db_path = DB_PATH + 
self.content_paths = ensure_content_dirs(os.path.dirname(DB_PATH)) self.pool_size = pool_size self.max_retries = max_retries self.connection_pool: Dict[int, aiosqlite.Connection] = {} self.pool_lock = asyncio.Lock() + self.init_lock = asyncio.Lock() self.connection_semaphore = asyncio.Semaphore(pool_size) + self._initialized = False + self.version_manager = VersionManager() + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"), + verbose=False, + tag_width=10 + ) + async def initialize(self): """Initialize the database and connection pool""" - await self.ainit_db() - + try: + self.logger.info("Initializing database", tag="INIT") + # Ensure the database file exists + os.makedirs(os.path.dirname(self.db_path), exist_ok=True) + + # Check if version update is needed + needs_update = self.version_manager.needs_update() + + # Always ensure base table exists + await self.ainit_db() + + # Verify the table exists + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: + async with db.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'" + ) as cursor: + result = await cursor.fetchone() + if not result: + raise Exception("crawled_data table was not created") + + # If version changed or fresh install, run updates + if needs_update: + self.logger.info("New version detected, running updates", tag="INIT") + await self.update_db_schema() + from .migrations import run_migration # Import here to avoid circular imports + await run_migration() + self.version_manager.update_version() # Update stored version after successful migration + self.logger.success("Version update completed successfully", tag="COMPLETE") + else: + self.logger.success("Database initialization completed successfully", tag="COMPLETE") + + + except Exception as e: + self.logger.error( + message="Database initialization error: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self.logger.info( + message="Database will be 
initialized on first use", + tag="INIT" + ) + + raise + + async def cleanup(self): """Cleanup connections when shutting down""" async with self.pool_lock: @@ -37,29 +98,43 @@ class AsyncDatabaseManager: @asynccontextmanager async def get_connection(self): """Connection pool manager""" - async with self.connection_semaphore: - task_id = id(asyncio.current_task()) - try: - async with self.pool_lock: - if task_id not in self.connection_pool: - conn = await aiosqlite.connect( - self.db_path, - timeout=30.0 - ) - await conn.execute('PRAGMA journal_mode = WAL') - await conn.execute('PRAGMA busy_timeout = 5000') - self.connection_pool[task_id] = conn - - yield self.connection_pool[task_id] - - except Exception as e: - logger.error(f"Connection error: {e}") - raise - finally: - async with self.pool_lock: - if task_id in self.connection_pool: - await self.connection_pool[task_id].close() - del self.connection_pool[task_id] + if not self._initialized: + # Use an asyncio.Lock to ensure only one initialization occurs + async with self.init_lock: + if not self._initialized: + await self.initialize() + self._initialized = True + + await self.connection_semaphore.acquire() + task_id = id(asyncio.current_task()) + try: + async with self.pool_lock: + if task_id not in self.connection_pool: + conn = await aiosqlite.connect( + self.db_path, + timeout=30.0 + ) + await conn.execute('PRAGMA journal_mode = WAL') + await conn.execute('PRAGMA busy_timeout = 5000') + self.connection_pool[task_id] = conn + + yield self.connection_pool[task_id] + + except Exception as e: + self.logger.error( + message="Connection error: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + raise + finally: + async with self.pool_lock: + if task_id in self.connection_pool: + await self.connection_pool[task_id].close() + del self.connection_pool[task_id] + self.connection_semaphore.release() + async def execute_with_retry(self, operation, *args): """Execute database operations with 
retry logic""" @@ -71,13 +146,21 @@ class AsyncDatabaseManager: return result except Exception as e: if attempt == self.max_retries - 1: - logger.error(f"Operation failed after {self.max_retries} attempts: {e}") + self.logger.error( + message="Operation failed after {retries} attempts: {error}", + tag="ERROR", + force_verbose=True, + params={ + "retries": self.max_retries, + "error": str(e) + } + ) raise await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff async def ainit_db(self): """Initialize database schema""" - async def _init(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: await db.execute(''' CREATE TABLE IF NOT EXISTS crawled_data ( url TEXT PRIMARY KEY, @@ -89,71 +172,168 @@ class AsyncDatabaseManager: media TEXT DEFAULT "{}", links TEXT DEFAULT "{}", metadata TEXT DEFAULT "{}", - screenshot TEXT DEFAULT "" + screenshot TEXT DEFAULT "", + response_headers TEXT DEFAULT "{}", + downloaded_files TEXT DEFAULT "{}" -- New column added ) ''') + await db.commit() + - await self.execute_with_retry(_init) - await self.update_db_schema() async def update_db_schema(self): """Update database schema if needed""" - async def _check_columns(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: cursor = await db.execute("PRAGMA table_info(crawled_data)") columns = await cursor.fetchall() - return [column[1] for column in columns] + column_names = [column[1] for column in columns] + + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] + + for column in new_columns: + if column not in column_names: + await self.aalter_db_add_column(column, db) + await db.commit() - column_names = await self.execute_with_retry(_check_columns) - - for column in ['media', 'links', 'metadata', 'screenshot']: - if column not in column_names: - await self.aalter_db_add_column(column) - - async def aalter_db_add_column(self, new_column: str): + async def 
aalter_db_add_column(self, new_column: str, db): """Add new column to the database""" - async def _alter(db): + if new_column == 'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') - logger.info(f"Added column '{new_column}' to the database.") + self.logger.info( + message="Added column '{column}' to the database", + tag="INIT", + params={"column": new_column} + ) - await self.execute_with_retry(_alter) - async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: - """Retrieve cached URL data""" + async def aget_cached_url(self, url: str) -> Optional[CrawlResult]: + """Retrieve cached URL data as CrawlResult""" async def _get(db): async with db.execute( - 'SELECT url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot FROM crawled_data WHERE url = ?', - (url,) + 'SELECT * FROM crawled_data WHERE url = ?', (url,) ) as cursor: - return await cursor.fetchone() + row = await cursor.fetchone() + if not row: + return None + + # Get column names + columns = [description[0] for description in cursor.description] + # Create dict from row data + row_dict = dict(zip(columns, row)) + + # Load content from files using stored hashes + content_fields = { + 'html': row_dict['html'], + 'cleaned_html': row_dict['cleaned_html'], + 'markdown': row_dict['markdown'], + 'extracted_content': row_dict['extracted_content'], + 'screenshot': row_dict['screenshot'] + } + + for field, hash_value in content_fields.items(): + if hash_value: + content = await self._load_content( + hash_value, + field.split('_')[0] # Get content type from field name + ) + row_dict[field] = content or "" + else: + row_dict[field] = "" + + # Parse JSON fields + json_fields = ['media', 'links', 'metadata', 'response_headers'] + for field in json_fields: + try: + row_dict[field] 
= json.loads(row_dict[field]) if row_dict[field] else {} + except json.JSONDecodeError: + row_dict[field] = {} + + # Parse downloaded_files + try: + row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else [] + except json.JSONDecodeError: + row_dict['downloaded_files'] = [] + + # Remove any fields not in CrawlResult model + valid_fields = CrawlResult.__annotations__.keys() + filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields} + + return CrawlResult(**filtered_dict) try: return await self.execute_with_retry(_get) except Exception as e: - logger.error(f"Error retrieving cached URL: {e}") + self.logger.error( + message="Error retrieving cached URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return None - async def acache_url(self, url: str, html: str, cleaned_html: str, markdown: str, extracted_content: str, success: bool, media: str = "{}", links: str = "{}", metadata: str = "{}", screenshot: str = ""): - """Cache URL data with retry logic""" + async def acache_url(self, result: CrawlResult): + """Cache CrawlResult data""" + # Store content files and get hashes + content_map = { + 'html': (result.html, 'html'), + 'cleaned_html': (result.cleaned_html or "", 'cleaned'), + 'markdown': (result.markdown or "", 'markdown'), + 'extracted_content': (result.extracted_content or "", 'extracted'), + 'screenshot': (result.screenshot or "", 'screenshots') + } + + content_hashes = {} + for field, (content, content_type) in content_map.items(): + content_hashes[field] = await self._store_content(content, content_type) + async def _cache(db): await db.execute(''' - INSERT INTO crawled_data (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ INSERT INTO crawled_data ( + url, html, cleaned_html, markdown, + extracted_content, success, media, links, metadata, + screenshot, response_headers, downloaded_files + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(url) DO UPDATE SET html = excluded.html, cleaned_html = excluded.cleaned_html, markdown = excluded.markdown, extracted_content = excluded.extracted_content, success = excluded.success, - media = excluded.media, - links = excluded.links, - metadata = excluded.metadata, - screenshot = excluded.screenshot - ''', (url, html, cleaned_html, markdown, extracted_content, success, media, links, metadata, screenshot)) + media = excluded.media, + links = excluded.links, + metadata = excluded.metadata, + screenshot = excluded.screenshot, + response_headers = excluded.response_headers, + downloaded_files = excluded.downloaded_files + ''', ( + result.url, + content_hashes['html'], + content_hashes['cleaned_html'], + content_hashes['markdown'], + content_hashes['extracted_content'], + result.success, + json.dumps(result.media), + json.dumps(result.links), + json.dumps(result.metadata or {}), + content_hashes['screenshot'], + json.dumps(result.response_headers or {}), + json.dumps(result.downloaded_files or []) + )) try: await self.execute_with_retry(_cache) except Exception as e: - logger.error(f"Error caching URL: {e}") + self.logger.error( + message="Error caching URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + async def aget_total_count(self) -> int: """Get total number of cached URLs""" @@ -165,7 +345,12 @@ class AsyncDatabaseManager: try: return await self.execute_with_retry(_count) except Exception as e: - logger.error(f"Error getting total count: {e}") + self.logger.error( + message="Error getting total count: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return 0 async def aclear_db(self): @@ -176,7 +361,12 @@ class AsyncDatabaseManager: try: await 
self.execute_with_retry(_clear) except Exception as e: - logger.error(f"Error clearing database: {e}") + self.logger.error( + message="Error clearing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) async def aflush_db(self): """Drop the entire table""" @@ -186,7 +376,46 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_flush) except Exception as e: - logger.error(f"Error flushing database: {e}") + self.logger.error( + message="Error flushing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + + + async def _store_content(self, content: str, content_type: str) -> str: + """Store content in filesystem and return hash""" + if not content: + return "" + + content_hash = generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + # Only write if file doesn't exist + if not os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]: + """Load content from filesystem by hash""" + if not content_hash: + return None + + file_path = os.path.join(self.content_paths[content_type], content_hash) + try: + async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: + return await f.read() + except: + self.logger.error( + message="Failed to load content: {file_path}", + tag="ERROR", + force_verbose=True, + params={"file_path": file_path} + ) + return None # Create a singleton instance -async_db_manager = AsyncDatabaseManager() \ No newline at end of file +async_db_manager = AsyncDatabaseManager() diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py new file mode 100644 index 00000000..220edd11 --- /dev/null +++ b/crawl4ai/async_logger.py @@ -0,0 +1,231 @@ +from enum import Enum +from typing import Optional, Dict, Any, Union +from colorama import Fore, 
Back, Style, init +import time +import os +from datetime import datetime + +class LogLevel(Enum): + DEBUG = 1 + INFO = 2 + SUCCESS = 3 + WARNING = 4 + ERROR = 5 + +class AsyncLogger: + """ + Asynchronous logger with support for colored console output and file logging. + Supports templated messages with colored components. + """ + + DEFAULT_ICONS = { + 'INIT': '→', + 'READY': '✓', + 'FETCH': '↓', + 'SCRAPE': '◆', + 'EXTRACT': '■', + 'COMPLETE': '●', + 'ERROR': '×', + 'DEBUG': '⋯', + 'INFO': 'ℹ', + 'WARNING': '⚠', + } + + DEFAULT_COLORS = { + LogLevel.DEBUG: Fore.LIGHTBLACK_EX, + LogLevel.INFO: Fore.CYAN, + LogLevel.SUCCESS: Fore.GREEN, + LogLevel.WARNING: Fore.YELLOW, + LogLevel.ERROR: Fore.RED, + } + + def __init__( + self, + log_file: Optional[str] = None, + log_level: LogLevel = LogLevel.INFO, + tag_width: int = 10, + icons: Optional[Dict[str, str]] = None, + colors: Optional[Dict[LogLevel, str]] = None, + verbose: bool = True + ): + """ + Initialize the logger. + + Args: + log_file: Optional file path for logging + log_level: Minimum log level to display + tag_width: Width for tag formatting + icons: Custom icons for different tags + colors: Custom colors for different log levels + verbose: Whether to output to console + """ + init() # Initialize colorama + self.log_file = log_file + self.log_level = log_level + self.tag_width = tag_width + self.icons = icons or self.DEFAULT_ICONS + self.colors = colors or self.DEFAULT_COLORS + self.verbose = verbose + + # Create log file directory if needed + if log_file: + os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) + + def _format_tag(self, tag: str) -> str: + """Format a tag with consistent width.""" + return f"[{tag}]".ljust(self.tag_width, ".") + + def _get_icon(self, tag: str) -> str: + """Get the icon for a tag, defaulting to info icon if not found.""" + return self.icons.get(tag, self.icons['INFO']) + + def _write_to_file(self, message: str): + """Write a message to the log file if 
configured.""" + if self.log_file: + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + with open(self.log_file, 'a', encoding='utf-8') as f: + # Strip ANSI color codes for file output + clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '') + for color in vars(Fore).values(): + if isinstance(color, str): + clean_message = clean_message.replace(color, '') + f.write(f"[{timestamp}] {clean_message}\n") + + def _log( + self, + level: LogLevel, + message: str, + tag: str, + params: Optional[Dict[str, Any]] = None, + colors: Optional[Dict[str, str]] = None, + base_color: Optional[str] = None, + **kwargs + ): + """ + Core logging method that handles message formatting and output. + + Args: + level: Log level for this message + message: Message template string + tag: Tag for the message + params: Parameters to format into the message + colors: Color overrides for specific parameters + base_color: Base color for the entire message + """ + if level.value < self.log_level.value: + return + + # Format the message with parameters if provided + if params: + try: + # First format the message with raw parameters + formatted_message = message.format(**params) + + # Then apply colors if specified + if colors: + for key, color in colors.items(): + # Find the formatted value in the message and wrap it with color + if key in params: + value_str = str(params[key]) + formatted_message = formatted_message.replace( + value_str, + f"{color}{value_str}{Style.RESET_ALL}" + ) + + except KeyError as e: + formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template" + level = LogLevel.ERROR + else: + formatted_message = message + + # Construct the full log line + color = base_color or self.colors[level] + log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}" + + # Output to console if verbose + if self.verbose or kwargs.get("force_verbose", False): + print(log_line) + + # Write to file if 
configured + self._write_to_file(log_line) + + def debug(self, message: str, tag: str = "DEBUG", **kwargs): + """Log a debug message.""" + self._log(LogLevel.DEBUG, message, tag, **kwargs) + + def info(self, message: str, tag: str = "INFO", **kwargs): + """Log an info message.""" + self._log(LogLevel.INFO, message, tag, **kwargs) + + def success(self, message: str, tag: str = "SUCCESS", **kwargs): + """Log a success message.""" + self._log(LogLevel.SUCCESS, message, tag, **kwargs) + + def warning(self, message: str, tag: str = "WARNING", **kwargs): + """Log a warning message.""" + self._log(LogLevel.WARNING, message, tag, **kwargs) + + def error(self, message: str, tag: str = "ERROR", **kwargs): + """Log an error message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + + def url_status( + self, + url: str, + success: bool, + timing: float, + tag: str = "FETCH", + url_length: int = 50 + ): + """ + Convenience method for logging URL fetch status. + + Args: + url: The URL being processed + success: Whether the operation was successful + timing: Time taken for the operation + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.SUCCESS if success else LogLevel.ERROR, + message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "status": success, + "timing": timing + }, + colors={ + "status": Fore.GREEN if success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + def error_status( + self, + url: str, + error: str, + tag: str = "ERROR", + url_length: int = 50 + ): + """ + Convenience method for logging error status. + + Args: + url: The URL being processed + error: Error message + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.ERROR, + message="{url:.{url_length}}... 
| Error: {error}", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "error": error + } + ) \ No newline at end of file diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 38e429ca..b8be6f35 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -1,36 +1,106 @@ import os import time +import warnings +from enum import Enum +from colorama import init, Fore, Back, Style from pathlib import Path -from typing import Optional +from typing import Optional, List, Union import json import asyncio -from .models import CrawlResult +from .models import CrawlResult, MarkdownGenerationResult from .async_database import async_db_manager from .chunking_strategy import * +from .content_filter_strategy import * from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse -from .content_scrapping_strategy import WebScrappingStrategy -from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD +from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode +from .content_scraping_strategy import WebScrapingStrategy +from .async_logger import AsyncLogger + +from .config import ( + MIN_WORD_THRESHOLD, + IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, + URL_LOG_SHORTEN_LENGTH +) from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, format_html ) -from ._version import __version__ as crawl4ai_version +from urllib.parse import urlparse +import random +from .__version__ import __version__ as crawl4ai_version + class AsyncWebCrawler: + """ + Asynchronous web crawler with flexible caching capabilities. 
+ + Migration Guide (from version X.X.X): + Old way (deprecated): + crawler = AsyncWebCrawler(always_by_pass_cache=True) + result = await crawler.arun( + url="https://example.com", + bypass_cache=True, + no_cache_read=True, + no_cache_write=False + ) + + New way (recommended): + crawler = AsyncWebCrawler(always_bypass_cache=True) + result = await crawler.arun( + url="https://example.com", + cache_mode=CacheMode.WRITE_ONLY + ) + + To disable deprecation warnings: + Pass warning=False to suppress the warning. + """ + _domain_last_hit = {} + def __init__( self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, - always_by_pass_cache: bool = False, + always_bypass_cache: bool = False, + always_by_pass_cache: Optional[bool] = None, # Deprecated parameter base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), **kwargs, ): + """ + Initialize the AsyncWebCrawler. + + Args: + crawler_strategy: Strategy for crawling web pages + always_bypass_cache: Whether to always bypass cache (new parameter) + always_by_pass_cache: Deprecated, use always_bypass_cache instead + base_directory: Base directory for storing cache + """ + self.verbose = kwargs.get("verbose", False) + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), + verbose=self.verbose, + tag_width=10 + ) + self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( + logger = self.logger, **kwargs ) - self.always_by_pass_cache = always_by_pass_cache - # self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") + + # Handle deprecated parameter + if always_by_pass_cache is not None: + if kwargs.get("warning", True): + warnings.warn( + "'always_by_pass_cache' is deprecated and will be removed in version X.X.X. " + "Use 'always_bypass_cache' instead. 
" + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + self.always_bypass_cache = always_by_pass_cache + else: + self.always_bypass_cache = always_bypass_cache + self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) @@ -46,21 +116,14 @@ class AsyncWebCrawler: await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) async def awarmup(self): - # Print a message for crawl4ai and its version - print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}") - if self.verbose: - print("[LOG] 🌤️ Warming up the AsyncWebCrawler") - # await async_db_manager.ainit_db() - await async_db_manager.initialize() - await self.arun( - url="https://google.com/", - word_count_threshold=5, - bypass_cache=False, - verbose=False, - ) + """Initialize the crawler with warm-up sequence.""" + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + # if self.verbose: + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") self.ready = True - if self.verbose: - print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") + # if self.verbose: + # print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") async def arun( self, @@ -68,14 +131,82 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, + cache_mode: Optional[CacheMode] = None, + # Deprecated parameters bypass_cache: bool = False, + disable_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False, + # Other parameters css_selector: str = None, screenshot: bool = 
False, user_agent: str = None, verbose=True, **kwargs, ) -> CrawlResult: + """ + Runs the crawler for a single source: URL (web, local file, or raw HTML). + + Migration from legacy cache parameters: + Old way (deprecated): + await crawler.arun(url, bypass_cache=True, no_cache_read=True) + + New way: + await crawler.arun(url, cache_mode=CacheMode.BYPASS) + + Args: + url: The URL to crawl (http://, https://, file://, or raw:) + cache_mode: Cache behavior control (recommended) + word_count_threshold: Minimum word count threshold + extraction_strategy: Strategy for content extraction + chunking_strategy: Strategy for content chunking + css_selector: CSS selector for content extraction + screenshot: Whether to capture screenshot + user_agent: Custom user agent + verbose: Enable verbose logging + + Deprecated Args: + bypass_cache: Use cache_mode=CacheMode.BYPASS instead + disable_cache: Use cache_mode=CacheMode.DISABLED instead + no_cache_read: Use cache_mode=CacheMode.WRITE_ONLY instead + no_cache_write: Use cache_mode=CacheMode.READ_ONLY instead + + Returns: + CrawlResult: The result of crawling and processing + """ try: + # Handle deprecated parameters + if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): + if kwargs.get("warning", True): + warnings.warn( + "Cache control boolean flags are deprecated and will be removed in version X.X.X. " + "Use 'cache_mode' parameter instead. 
Examples:\n" + "- For bypass_cache=True, use cache_mode=CacheMode.BYPASS\n" + "- For disable_cache=True, use cache_mode=CacheMode.DISABLED\n" + "- For no_cache_read=True, use cache_mode=CacheMode.WRITE_ONLY\n" + "- For no_cache_write=True, use cache_mode=CacheMode.READ_ONLY\n" + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + + # Convert legacy parameters if cache_mode not provided + if cache_mode is None: + cache_mode = _legacy_to_cache_mode( + disable_cache=disable_cache, + bypass_cache=bypass_cache, + no_cache_read=no_cache_read, + no_cache_write=no_cache_write + ) + + # Default to ENABLED if no cache mode specified + if cache_mode is None: + cache_mode = CacheMode.ENABLED + + # Create cache context + cache_context = CacheContext(url, cache_mode, self.always_bypass_cache) + extraction_strategy = extraction_strategy or NoExtractionStrategy() extraction_strategy.verbose = verbose if not isinstance(extraction_strategy, ExtractionStrategy): @@ -86,61 +217,126 @@ class AsyncWebCrawler: word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) async_response: AsyncCrawlResponse = None - cached = None + cached_result = None screenshot_data = None extracted_content = None - if not bypass_cache and not self.always_by_pass_cache: - cached = await async_db_manager.aget_cached_url(url) - - if kwargs.get("warmup", True) and not self.ready: - return None - - if cached: - html = sanitize_input_encode(cached[1]) - extracted_content = sanitize_input_encode(cached[4]) + + start_time = time.perf_counter() + + # Try to get cached result if appropriate + if cache_context.should_read(): + cached_result = await async_db_manager.aget_cached_url(url) + + if cached_result: + html = sanitize_input_encode(cached_result.html) + extracted_content = sanitize_input_encode(cached_result.extracted_content or "") if screenshot: - screenshot_data = cached[9] + screenshot_data = cached_result.screenshot if not screenshot_data: - cached = None + 
cached_result = None + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH" + ) - if not cached or not html: - t1 = time.time() + + # Fetch fresh content if needed + if not cached_result or not html: + t1 = time.perf_counter() + if user_agent: self.crawler_strategy.update_user_agent(user_agent) - async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) + async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl( + url, + screenshot=screenshot, + **kwargs + ) html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot - t2 = time.time() - if verbose: - print( - f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" - ) + t2 = time.perf_counter() + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=t2 - t1, + tag="FETCH" + ) + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... 
| Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") + # Process the HTML content crawl_result = await self.aprocess_html( - url, - html, - extracted_content, - word_count_threshold, - extraction_strategy, - chunking_strategy, - css_selector, - screenshot_data, - verbose, - bool(cached), + url=url, + html=html, + extracted_content=extracted_content, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + content_filter=content_filter, + css_selector=css_selector, + screenshot=screenshot_data, + verbose=verbose, + is_cached=bool(cached_result), async_response=async_response, - bypass_cache=bypass_cache, + is_web_url=cache_context.is_web_url, + is_local_file=cache_context.is_local_file, + is_raw_html=cache_context.is_raw_html, **kwargs, ) - crawl_result.status_code = async_response.status_code if async_response else 200 - crawl_result.response_headers = async_response.response_headers if async_response else {} + + # Set response data + if async_response: + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + else: + crawl_result.status_code = 200 + crawl_result.response_headers = cached_result.response_headers if cached_result else {} + crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) + + # if verbose: + # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="{url:.50}... 
| Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + # Update cache if appropriate + if cache_context.should_write() and not bool(cached_result): + await async_db_manager.acache_url(crawl_result) + return crawl_result + except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}") - return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg) + # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + self.logger.error_status( + url=cache_context.display_url, + error=e.msg, + tag="ERROR" + ) + return CrawlResult( + url=url, + html="", + markdown=f"[ERROR] 🚫 arun(): Failed to crawl {cache_context.display_url}, error: {e.msg}", + success=False, + error_message=e.msg + ) async def arun_many( self, @@ -148,6 +344,9 @@ class AsyncWebCrawler: word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), + content_filter: RelevantContentFilter = None, + cache_mode: Optional[CacheMode] = None, + # Deprecated parameters bypass_cache: bool = False, css_selector: str = None, screenshot: bool = False, @@ -155,22 +354,102 @@ class AsyncWebCrawler: verbose=True, **kwargs, ) -> List[CrawlResult]: - tasks = [ - self.arun( - url, - word_count_threshold, - extraction_strategy, - chunking_strategy, - bypass_cache, - css_selector, - screenshot, - user_agent, - verbose, - **kwargs - ) - for url in urls - ] - return await asyncio.gather(*tasks) + """ + Runs the crawler for multiple URLs 
concurrently. + + Migration from legacy parameters: + Old way (deprecated): + results = await crawler.arun_many(urls, bypass_cache=True) + + New way: + results = await crawler.arun_many(urls, cache_mode=CacheMode.BYPASS) + + Args: + urls: List of URLs to crawl + cache_mode: Cache behavior control (recommended) + [other parameters same as arun()] + + Returns: + List[CrawlResult]: Results for each URL + """ + if bypass_cache: + if kwargs.get("warning", True): + warnings.warn( + "'bypass_cache' is deprecated and will be removed in version X.X.X. " + "Use 'cache_mode=CacheMode.BYPASS' instead. " + "Pass warning=False to suppress this warning.", + DeprecationWarning, + stacklevel=2 + ) + if cache_mode is None: + cache_mode = CacheMode.BYPASS + + semaphore_count = kwargs.get('semaphore_count', 10) + semaphore = asyncio.Semaphore(semaphore_count) + + async def crawl_with_semaphore(url): + domain = urlparse(url).netloc + current_time = time.time() + + # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + self.logger.debug( + message="Started task for {url:.50}...", + tag="PARALLEL", + params={"url": url} + ) + + # Get delay settings from kwargs or use defaults + mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay + max_range = kwargs.get('max_range', 0.3) # 1 seconds default max additional delay + + # Check if we need to wait + if domain in self._domain_last_hit: + time_since_last = current_time - self._domain_last_hit[domain] + if time_since_last < mean_delay: + delay = mean_delay + random.uniform(0, max_range) + await asyncio.sleep(delay) + + # Update last hit time + self._domain_last_hit[domain] = current_time + + async with semaphore: + return await self.arun( + url, + word_count_threshold=word_count_threshold, + extraction_strategy=extraction_strategy, + chunking_strategy=chunking_strategy, + content_filter=content_filter, + cache_mode=cache_mode, + css_selector=css_selector, + 
screenshot=screenshot, + user_agent=user_agent, + verbose=verbose, + **kwargs, + ) + + # Print start message + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + self.logger.info( + message="Starting concurrent crawling for {count} URLs...", + tag="INIT", + params={"count": len(urls)} + ) + start_time = time.perf_counter() + tasks = [crawl_with_semaphore(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + end_time = time.perf_counter() + # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL, + tag="COMPLETE", + params={ + "count": len(urls), + "timing": f"{end_time - start_time:.2f}s" + }, + colors={"timing": Fore.YELLOW} + ) + return [result if not isinstance(result, Exception) else str(result) for result in results] + async def aprocess_html( self, @@ -180,33 +459,30 @@ class AsyncWebCrawler: word_count_threshold: int, extraction_strategy: ExtractionStrategy, chunking_strategy: ChunkingStrategy, + content_filter: RelevantContentFilter, css_selector: str, screenshot: str, verbose: bool, - is_cached: bool, **kwargs, ) -> CrawlResult: - t = time.time() # Extract content from HTML try: - t1 = time.time() - scrapping_strategy = WebScrappingStrategy() + _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" + t1 = time.perf_counter() + scrapping_strategy = WebScrapingStrategy() # result = await scrapping_strategy.ascrap( result = scrapping_strategy.scrap( url, html, word_count_threshold=word_count_threshold, css_selector=css_selector, - only_text=kwargs.get("only_text", False), - image_description_min_word_threshold=kwargs.get( + 
only_text=kwargs.pop("only_text", False), + image_description_min_word_threshold=kwargs.pop( "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ), + content_filter = content_filter, **kwargs, ) - if verbose: - print( - f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds" - ) if result is None: raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") @@ -215,6 +491,8 @@ class AsyncWebCrawler: except Exception as e: raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") + markdown_v2: MarkdownGenerationResult = result.get("markdown_v2", None) + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) markdown = sanitize_input_encode(result.get("markdown", "")) fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) @@ -222,13 +500,21 @@ class AsyncWebCrawler: media = result.get("media", []) links = result.get("links", []) metadata = result.get("metadata", {}) + + # if verbose: + # print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") + self.logger.info( + message="Processed {url:.50}... 
| Time: {timing}ms", + tag="SCRAPE", + params={ + "url": _url, + "timing": int((time.perf_counter() - t1) * 1000) + } + ) - if extracted_content is None and extraction_strategy and chunking_strategy: - if verbose: - print( - f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}" - ) + if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): + t1 = time.perf_counter() # Check if extraction strategy is type of JsonCssExtractionStrategy if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy): extraction_strategy.verbose = verbose @@ -238,32 +524,27 @@ class AsyncWebCrawler: sections = chunking_strategy.chunk(markdown) extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - - if verbose: - print( - f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds." + # if verbose: + # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + self.logger.info( + message="Completed for {url:.50}... 
| Time: {timing}s", + tag="EXTRACT", + params={ + "url": _url, + "timing": time.perf_counter() - t1 + } ) + + + screenshot = None if not screenshot else screenshot - - if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache: - await async_db_manager.acache_url( - url, - html, - cleaned_html, - markdown, - extracted_content, - True, - json.dumps(media), - json.dumps(links), - json.dumps(metadata), - screenshot=screenshot, - ) - + return CrawlResult( url=url, html=html, cleaned_html=format_html(cleaned_html), + markdown_v2=markdown_v2, markdown=markdown, fit_markdown=fit_markdown, fit_html= fit_html, @@ -277,13 +558,15 @@ class AsyncWebCrawler: ) async def aclear_cache(self): - # await async_db_manager.aclear_db() + """Clear the cache database.""" await async_db_manager.cleanup() async def aflush_cache(self): + """Flush the cache database.""" await async_db_manager.aflush_db() async def aget_cache_size(self): + """Get the total number of cached items.""" return await async_db_manager.aget_total_count() diff --git a/crawl4ai/cache_context.py b/crawl4ai/cache_context.py new file mode 100644 index 00000000..429eacc1 --- /dev/null +++ b/crawl4ai/cache_context.py @@ -0,0 +1,79 @@ +from enum import Enum + + +class CacheMode(Enum): + """ + Defines the caching behavior for web crawling operations. + + Modes: + - ENABLED: Normal caching behavior (read and write) + - DISABLED: No caching at all + - READ_ONLY: Only read from cache, don't write + - WRITE_ONLY: Only write to cache, don't read + - BYPASS: Bypass cache for this operation + """ + ENABLED = "enabled" + DISABLED = "disabled" + READ_ONLY = "read_only" + WRITE_ONLY = "write_only" + BYPASS = "bypass" + + +class CacheContext: + """ + Encapsulates cache-related decisions and URL handling. + + This class centralizes all cache-related logic and URL type checking, + making the caching behavior more predictable and maintainable. 
+ """ + def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False): + self.url = url + self.cache_mode = cache_mode + self.always_bypass = always_bypass + self.is_cacheable = url.startswith(('http://', 'https://', 'file://')) + self.is_web_url = url.startswith(('http://', 'https://')) + self.is_local_file = url.startswith("file://") + self.is_raw_html = url.startswith("raw:") + self._url_display = url if not self.is_raw_html else "Raw HTML" + + def should_read(self) -> bool: + """Determines if cache should be read based on context.""" + if self.always_bypass or not self.is_cacheable: + return False + return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY] + + def should_write(self) -> bool: + """Determines if cache should be written based on context.""" + if self.always_bypass or not self.is_cacheable: + return False + return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY] + + @property + def display_url(self) -> str: + """Returns the URL in display format.""" + return self._url_display + + +def _legacy_to_cache_mode( + disable_cache: bool = False, + bypass_cache: bool = False, + no_cache_read: bool = False, + no_cache_write: bool = False +) -> CacheMode: + """ + Converts legacy cache parameters to the new CacheMode enum. + + This is an internal function to help transition from the old boolean flags + to the new CacheMode system. 
+ """ + if disable_cache: + return CacheMode.DISABLED + if bypass_cache: + return CacheMode.BYPASS + if no_cache_read and no_cache_write: + return CacheMode.DISABLED + if no_cache_read: + return CacheMode.WRITE_ONLY + if no_cache_write: + return CacheMode.READ_ONLY + return CacheMode.ENABLED diff --git a/crawl4ai/config.py b/crawl4ai/config.py index a07ca977..786ca4e5 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -51,3 +51,9 @@ SOCIAL_MEDIA_DOMAINS = [ # If image format is in jpg, png or webp # If image is in the first half of the total images extracted from the page IMAGE_SCORE_THRESHOLD = 2 + +MAX_METRICS_HISTORY = 1000 + +NEED_MIGRATION = True +URL_LOG_SHORTEN_LENGTH = 30 +SHOW_DEPRECATION_WARNINGS = True \ No newline at end of file diff --git a/crawl4ai/content_cleaning_strategy.py b/crawl4ai/content_cleaning_strategy.py deleted file mode 100644 index 2f052f76..00000000 --- a/crawl4ai/content_cleaning_strategy.py +++ /dev/null @@ -1,196 +0,0 @@ -from bs4 import BeautifulSoup, Tag -import re -from typing import Optional - -class ContentCleaningStrategy: - def __init__(self): - # Precompile regex patterns for performance - self.negative_patterns = re.compile(r'nav|footer|header|sidebar|ads|comment', re.I) - self.positive_patterns = re.compile(r'content|article|main|post', re.I) - self.priority_tags = {'article', 'main', 'section', 'div'} - self.non_content_tags = {'nav', 'footer', 'header', 'aside'} - # Thresholds - self.text_density_threshold = 9.0 - self.min_word_count = 50 - self.link_density_threshold = 0.2 - self.max_dom_depth = 10 # To prevent excessive DOM traversal - - def clean(self, clean_html: str) -> str: - """ - Main function that takes cleaned HTML and returns super cleaned HTML. - - Args: - clean_html (str): The cleaned HTML content. - - Returns: - str: The super cleaned HTML containing only the main content. 
- """ - try: - if not clean_html or not isinstance(clean_html, str): - return '' - soup = BeautifulSoup(clean_html, 'html.parser') - main_content = self.extract_main_content(soup) - if main_content: - super_clean_element = self.clean_element(main_content) - return str(super_clean_element) - else: - return '' - except Exception: - # Handle exceptions silently or log them as needed - return '' - - def extract_main_content(self, soup: BeautifulSoup) -> Optional[Tag]: - """ - Identifies and extracts the main content element from the HTML. - - Args: - soup (BeautifulSoup): The parsed HTML soup. - - Returns: - Optional[Tag]: The Tag object containing the main content, or None if not found. - """ - candidates = [] - for element in soup.find_all(self.priority_tags): - if self.is_non_content_tag(element): - continue - if self.has_negative_class_id(element): - continue - score = self.calculate_content_score(element) - candidates.append((score, element)) - - if not candidates: - return None - - # Sort candidates by score in descending order - candidates.sort(key=lambda x: x[0], reverse=True) - # Select the element with the highest score - best_element = candidates[0][1] - return best_element - - def calculate_content_score(self, element: Tag) -> float: - """ - Calculates a score for an element based on various heuristics. - - Args: - element (Tag): The HTML element to score. - - Returns: - float: The content score of the element. 
- """ - score = 0.0 - - if self.is_priority_tag(element): - score += 5.0 - if self.has_positive_class_id(element): - score += 3.0 - if self.has_negative_class_id(element): - score -= 3.0 - if self.is_high_text_density(element): - score += 2.0 - if self.is_low_link_density(element): - score += 2.0 - if self.has_sufficient_content(element): - score += 2.0 - if self.has_headings(element): - score += 3.0 - - dom_depth = self.calculate_dom_depth(element) - score += min(dom_depth, self.max_dom_depth) * 0.5 # Adjust weight as needed - - return score - - def is_priority_tag(self, element: Tag) -> bool: - """Checks if the element is a priority tag.""" - return element.name in self.priority_tags - - def is_non_content_tag(self, element: Tag) -> bool: - """Checks if the element is a non-content tag.""" - return element.name in self.non_content_tags - - def has_negative_class_id(self, element: Tag) -> bool: - """Checks if the element has negative indicators in its class or id.""" - class_id = ' '.join(filter(None, [ - self.get_attr_str(element.get('class')), - element.get('id', '') - ])) - return bool(self.negative_patterns.search(class_id)) - - def has_positive_class_id(self, element: Tag) -> bool: - """Checks if the element has positive indicators in its class or id.""" - class_id = ' '.join(filter(None, [ - self.get_attr_str(element.get('class')), - element.get('id', '') - ])) - return bool(self.positive_patterns.search(class_id)) - - @staticmethod - def get_attr_str(attr) -> str: - """Converts an attribute value to a string.""" - if isinstance(attr, list): - return ' '.join(attr) - elif isinstance(attr, str): - return attr - else: - return '' - - def is_high_text_density(self, element: Tag) -> bool: - """Determines if the element has high text density.""" - text_density = self.calculate_text_density(element) - return text_density > self.text_density_threshold - - def calculate_text_density(self, element: Tag) -> float: - """Calculates the text density of an element.""" - 
text_length = len(element.get_text(strip=True)) - tag_count = len(element.find_all()) - tag_count = tag_count or 1 # Prevent division by zero - return text_length / tag_count - - def is_low_link_density(self, element: Tag) -> bool: - """Determines if the element has low link density.""" - link_density = self.calculate_link_density(element) - return link_density < self.link_density_threshold - - def calculate_link_density(self, element: Tag) -> float: - """Calculates the link density of an element.""" - text = element.get_text(strip=True) - if not text: - return 0.0 - link_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a')) - return len(link_text) / len(text) if text else 0.0 - - def has_sufficient_content(self, element: Tag) -> bool: - """Checks if the element has sufficient word count.""" - word_count = len(element.get_text(strip=True).split()) - return word_count >= self.min_word_count - - def calculate_dom_depth(self, element: Tag) -> int: - """Calculates the depth of an element in the DOM tree.""" - depth = 0 - current_element = element - while current_element.parent and depth < self.max_dom_depth: - depth += 1 - current_element = current_element.parent - return depth - - def has_headings(self, element: Tag) -> bool: - """Checks if the element contains heading tags.""" - return bool(element.find(['h1', 'h2', 'h3'])) - - def clean_element(self, element: Tag) -> Tag: - """ - Cleans the selected element by removing unnecessary attributes and nested non-content elements. - - Args: - element (Tag): The HTML element to clean. - - Returns: - Tag: The cleaned HTML element. 
- """ - for tag in element.find_all(['script', 'style', 'aside']): - tag.decompose() - for tag in element.find_all(): - attrs = dict(tag.attrs) - for attr in attrs: - if attr in ['style', 'onclick', 'onmouseover', 'align', 'bgcolor']: - del tag.attrs[attr] - return element diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py new file mode 100644 index 00000000..88216f7f --- /dev/null +++ b/crawl4ai/content_filter_strategy.py @@ -0,0 +1,328 @@ +import re +from bs4 import BeautifulSoup, Tag +from typing import List, Tuple, Dict +from rank_bm25 import BM25Okapi +from time import perf_counter +from collections import deque +from bs4 import BeautifulSoup, NavigableString, Tag +from .utils import clean_tokens +from abc import ABC, abstractmethod + +from snowballstemmer import stemmer + +# from nltk.stem import PorterStemmer +# ps = PorterStemmer() +class RelevantContentFilter(ABC): + def __init__(self, user_query: str = None): + self.user_query = user_query + self.included_tags = { + # Primary structure + 'article', 'main', 'section', 'div', + # List structures + 'ul', 'ol', 'li', 'dl', 'dt', 'dd', + # Text content + 'p', 'span', 'blockquote', 'pre', 'code', + # Headers + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + # Tables + 'table', 'thead', 'tbody', 'tr', 'td', 'th', + # Other semantic elements + 'figure', 'figcaption', 'details', 'summary', + # Text formatting + 'em', 'strong', 'b', 'i', 'mark', 'small', + # Rich content + 'time', 'address', 'cite', 'q' + } + self.excluded_tags = { + 'nav', 'footer', 'header', 'aside', 'script', + 'style', 'form', 'iframe', 'noscript' + } + self.header_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} + self.negative_patterns = re.compile( + r'nav|footer|header|sidebar|ads|comment|promo|advert|social|share', + re.I + ) + self.min_word_count = 2 + + @abstractmethod + def filter_content(self, html: str) -> List[str]: + """Abstract method to be implemented by specific filtering strategies""" + pass + + def 
extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str: + """Common method to extract page metadata with fallbacks""" + if self.user_query: + return self.user_query + + query_parts = [] + + # Title + if soup.title: + query_parts.append(soup.title.string) + elif soup.find('h1'): + query_parts.append(soup.find('h1').get_text()) + + # Meta tags + temp = "" + for meta_name in ['keywords', 'description']: + meta = soup.find('meta', attrs={'name': meta_name}) + if meta and meta.get('content'): + query_parts.append(meta['content']) + temp += meta['content'] + + # If still empty, grab first significant paragraph + if not temp: + # Find the first tag P thatits text contains more than 50 characters + for p in body.find_all('p'): + if len(p.get_text()) > 150: + query_parts.append(p.get_text()[:150]) + break + + return ' '.join(filter(None, query_parts)) + + + def extract_text_chunks(self, body: Tag) -> List[Tuple[str, str]]: + """ + Extracts text chunks from a BeautifulSoup body element while preserving order. + Returns list of tuples (text, tag_name) for classification. 
+ + Args: + body: BeautifulSoup Tag object representing the body element + + Returns: + List of (text, tag_name) tuples + """ + # Tags to ignore - inline elements that shouldn't break text flow + INLINE_TAGS = { + 'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code', + 'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q', + 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup', + 'textarea', 'time', 'tt', 'var' + } + + # Tags that typically contain meaningful headers + HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header'} + + chunks = [] + current_text = [] + chunk_index = 0 + + def should_break_chunk(tag: Tag) -> bool: + """Determine if a tag should cause a break in the current text chunk""" + return ( + tag.name not in INLINE_TAGS + and not (tag.name == 'p' and len(current_text) == 0) + ) + + # Use deque for efficient push/pop operations + stack = deque([(body, False)]) + + while stack: + element, visited = stack.pop() + + if visited: + # End of block element - flush accumulated text + if current_text and should_break_chunk(element): + text = ' '.join(''.join(current_text).split()) + if text: + tag_type = 'header' if element.name in HEADER_TAGS else 'content' + chunks.append((chunk_index, text, tag_type, element)) + chunk_index += 1 + current_text = [] + continue + + if isinstance(element, NavigableString): + if str(element).strip(): + current_text.append(str(element).strip()) + continue + + # Pre-allocate children to avoid multiple list operations + children = list(element.children) + if not children: + continue + + # Mark block for revisit after processing children + stack.append((element, True)) + + # Add children in reverse order for correct processing + for child in reversed(children): + if isinstance(child, (Tag, NavigableString)): + stack.append((child, False)) + + # Handle any remaining text + if current_text: + text = ' '.join(''.join(current_text).split()) + if text: + 
chunks.append((chunk_index, text, 'content', body)) + + return chunks + + + def extract_text_chunks1(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]: + """Common method for extracting text chunks""" + _text_cache = {} + def fast_text(element: Tag) -> str: + elem_id = id(element) + if elem_id in _text_cache: + return _text_cache[elem_id] + texts = [] + for content in element.contents: + if isinstance(content, str): + text = content.strip() + if text: + texts.append(text) + result = ' '.join(texts) + _text_cache[elem_id] = result + return result + + candidates = [] + index = 0 + + def dfs(element): + nonlocal index + if isinstance(element, Tag): + if element.name in self.included_tags: + if not self.is_excluded(element): + text = fast_text(element) + word_count = len(text.split()) + + # Headers pass through with adjusted minimum + if element.name in self.header_tags: + if word_count >= 3: # Minimal sanity check for headers + candidates.append((index, text, element)) + index += 1 + # Regular content uses standard minimum + elif word_count >= self.min_word_count: + candidates.append((index, text, element)) + index += 1 + + for child in element.children: + dfs(child) + + dfs(soup.body if soup.body else soup) + return candidates + + def is_excluded(self, tag: Tag) -> bool: + """Common method for exclusion logic""" + if tag.name in self.excluded_tags: + return True + class_id = ' '.join(filter(None, [ + ' '.join(tag.get('class', [])), + tag.get('id', '') + ])) + return bool(self.negative_patterns.search(class_id)) + + def clean_element(self, tag: Tag) -> str: + """Common method for cleaning HTML elements with minimal overhead""" + if not tag or not isinstance(tag, Tag): + return "" + + unwanted_tags = {'script', 'style', 'aside', 'form', 'iframe', 'noscript'} + unwanted_attrs = {'style', 'onclick', 'onmouseover', 'align', 'bgcolor', 'class', 'id'} + + # Use string builder pattern for better performance + builder = [] + + def render_tag(elem): + if not 
isinstance(elem, Tag): + if isinstance(elem, str): + builder.append(elem.strip()) + return + + if elem.name in unwanted_tags: + return + + # Start tag + builder.append(f'<{elem.name}') + + # Add cleaned attributes + attrs = {k: v for k, v in elem.attrs.items() if k not in unwanted_attrs} + for key, value in attrs.items(): + builder.append(f' {key}="{value}"') + + builder.append('>') + + # Process children + for child in elem.children: + render_tag(child) + + # Close tag + builder.append(f'') + + try: + render_tag(tag) + return ''.join(builder) + except Exception: + return str(tag) # Fallback to original if anything fails + +class BM25ContentFilter(RelevantContentFilter): + def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'): + super().__init__(user_query=user_query) + self.bm25_threshold = bm25_threshold + self.priority_tags = { + 'h1': 5.0, + 'h2': 4.0, + 'h3': 3.0, + 'title': 4.0, + 'strong': 2.0, + 'b': 1.5, + 'em': 1.5, + 'blockquote': 2.0, + 'code': 2.0, + 'pre': 1.5, + 'th': 1.5, # Table headers + } + self.stemmer = stemmer(language) + + def filter_content(self, html: str) -> List[str]: + """Implements content filtering using BM25 algorithm with priority tag handling""" + if not html or not isinstance(html, str): + return [] + + soup = BeautifulSoup(html, 'lxml') + body = soup.find('body') + query = self.extract_page_query(soup.find('head'), body) + candidates = self.extract_text_chunks(body) + + if not candidates: + return [] + + # Tokenize corpus + # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates] + # tokenized_query = query.lower().split() + + # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()] + # for _, chunk, _, _ in candidates] + # tokenized_query = [ps.stem(word) for word in query.lower().split()] + + tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()] + for _, chunk, _, _ in candidates] + tokenized_query = 
[self.stemmer.stemWord(word) for word in query.lower().split()] + + # Clean from stop words and noise + tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] + tokenized_query = clean_tokens(tokenized_query) + + bm25 = BM25Okapi(tokenized_corpus) + scores = bm25.get_scores(tokenized_query) + + # Adjust scores with tag weights + adjusted_candidates = [] + for score, (index, chunk, tag_type, tag) in zip(scores, candidates): + tag_weight = self.priority_tags.get(tag.name, 1.0) + adjusted_score = score * tag_weight + adjusted_candidates.append((adjusted_score, index, chunk, tag)) + + # Filter candidates by threshold + selected_candidates = [ + (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates + if adjusted_score >= self.bm25_threshold + ] + + if not selected_candidates: + return [] + + # Sort selected candidates by original document order + selected_candidates.sort(key=lambda x: x[0]) + + return [self.clean_element(tag) for _, _, tag in selected_candidates] diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scraping_strategy.py similarity index 57% rename from crawl4ai/content_scrapping_strategy.py rename to crawl4ai/content_scraping_strategy.py index caed7319..d4b901d2 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -1,5 +1,6 @@ +import re # Point 1: Pre-Compile Regular Expressions from abc import ABC, abstractmethod -from typing import Dict, Any +from typing import Dict, Any, Optional from bs4 import BeautifulSoup from concurrent.futures import ThreadPoolExecutor import asyncio, requests, re, os @@ -7,105 +8,54 @@ from .config import * from bs4 import element, NavigableString, Comment from urllib.parse import urljoin from requests.exceptions import InvalidSchema -from .content_cleaning_strategy import ContentCleaningStrategy - +# from .content_cleaning_strategy import ContentCleaningStrategy +from .content_filter_strategy import 
RelevantContentFilter, BM25ContentFilter +from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy +from .models import MarkdownGenerationResult from .utils import ( sanitize_input_encode, sanitize_html, extract_metadata, InvalidCSSSelectorError, - # CustomHTML2Text, + CustomHTML2Text, normalize_url, - is_external_url - + is_external_url ) +from .tools import profile_and_time -from .html2text import HTML2Text -class CustomHTML2Text(HTML2Text): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.inside_pre = False - self.inside_code = False - self.preserve_tags = set() # Set of tags to preserve - self.current_preserved_tag = None - self.preserved_content = [] - self.preserve_depth = 0 - - # Configuration options - self.skip_internal_links = False - self.single_line_break = False - self.mark_code = False - self.include_sup_sub = False - self.body_width = 0 - self.ignore_mailto_links = True - self.ignore_links = False - self.escape_backslash = False - self.escape_dot = False - self.escape_plus = False - self.escape_dash = False - self.escape_snob = False +# Pre-compile regular expressions for Open Graph and Twitter metadata +OG_REGEX = re.compile(r'^og:') +TWITTER_REGEX = re.compile(r'^twitter:') +DIMENSION_REGEX = re.compile(r"(\d+)(\D*)") - def update_params(self, **kwargs): - """Update parameters and set preserved tags.""" - for key, value in kwargs.items(): - if key == 'preserve_tags': - self.preserve_tags = set(value) - else: - setattr(self, key, value) +# Function to parse image height/width value and units +def parse_dimension(dimension): + if dimension: + # match = re.match(r"(\d+)(\D*)", dimension) + match = DIMENSION_REGEX.match(dimension) + if match: + number = int(match.group(1)) + unit = match.group(2) or 'px' # Default unit is 'px' if not specified + return number, unit + return None, None - def handle_tag(self, tag, attrs, start): - # Handle preserved tags - if tag in 
self.preserve_tags: - if start: - if self.preserve_depth == 0: - self.current_preserved_tag = tag - self.preserved_content = [] - # Format opening tag with attributes - attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) - self.preserved_content.append(f'<{tag}{attr_str}>') - self.preserve_depth += 1 - return - else: - self.preserve_depth -= 1 - if self.preserve_depth == 0: - self.preserved_content.append(f'') - # Output the preserved HTML block with proper spacing - preserved_html = ''.join(self.preserved_content) - self.o('\n' + preserved_html + '\n') - self.current_preserved_tag = None - return - - # If we're inside a preserved tag, collect all content - if self.preserve_depth > 0: - if start: - # Format nested tags with attributes - attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) - self.preserved_content.append(f'<{tag}{attr_str}>') - else: - self.preserved_content.append(f'') - return - - # Handle pre tags - if tag == 'pre': - if start: - self.o('```\n') - self.inside_pre = True - else: - self.o('\n```') - self.inside_pre = False - # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: - # pass +# Fetch image file metadata to extract size and extension +def fetch_image_file_size(img, base_url): + #If src is relative path construct full URL, if not it may be CDN URL + img_url = urljoin(base_url,img.get('src')) + try: + response = requests.head(img_url) + if response.status_code == 200: + return response.headers.get('Content-Length',None) else: - super().handle_tag(tag, attrs, start) + print(f"Failed to retrieve file size for {img_url}") + return None + except InvalidSchema as e: + return None + finally: + return - def handle_data(self, data, entity_char=False): - """Override handle_data to capture content within preserved tags.""" - if self.preserve_depth > 0: - self.preserved_content.append(data) - return - super().handle_data(data, entity_char) - -class ContentScrappingStrategy(ABC): +class 
ContentScrapingStrategy(ABC): @abstractmethod def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: pass @@ -114,21 +64,127 @@ class ContentScrappingStrategy(ABC): async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: pass -class WebScrappingStrategy(ContentScrappingStrategy): +class WebScrapingStrategy(ContentScrapingStrategy): + def __init__(self, logger=None): + self.logger = logger + + def _log(self, level, message, tag="SCRAPE", **kwargs): + """Helper method to safely use logger.""" + if self.logger: + log_method = getattr(self.logger, level) + log_method(message=message, tag=tag, **kwargs) + def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs) async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return await asyncio.to_thread(self._get_content_of_website_optimized, url, html, **kwargs) + + def _generate_markdown_content(self, + cleaned_html: str, + html: str, + url: str, + success: bool, + **kwargs) -> Dict[str, Any]: + """Generate markdown content using either new strategy or legacy method. 
+ + Args: + cleaned_html: Sanitized HTML content + html: Original HTML content + url: Base URL of the page + success: Whether scraping was successful + **kwargs: Additional options including: + - markdown_generator: Optional[MarkdownGenerationStrategy] + - html2text: Dict[str, Any] options for HTML2Text + - content_filter: Optional[RelevantContentFilter] + - fit_markdown: bool + - fit_markdown_user_query: Optional[str] + - fit_markdown_bm25_threshold: float + + Returns: + Dict containing markdown content in various formats + """ + markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerationStrategy()) + + if markdown_generator: + try: + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( + cleaned_html=cleaned_html, + base_url=url, + html2text_options=kwargs.get('html2text', {}), + content_filter=kwargs.get('content_filter', None) + ) + + return { + 'markdown': markdown_result.raw_markdown, + 'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'markdown_v2': markdown_result + } + except Exception as e: + self._log('error', + message="Error using new markdown generation strategy: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + markdown_generator = None + + # Legacy method + h = CustomHTML2Text() + h.update_params(**kwargs.get('html2text', {})) + markdown = h.handle(cleaned_html) + markdown = markdown.replace(' ```', '```') + + fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." + fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." 
+ + if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): + content_filter = kwargs.get('content_filter', None) + if not content_filter: + content_filter = BM25ContentFilter( + user_query=kwargs.get('fit_markdown_user_query', None), + bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) + ) + fit_html = content_filter.filter_content(html) + fit_html = '\n'.join('
{}
'.format(s) for s in fit_html) + fit_markdown = h.handle(fit_html) + + markdown_v2 = MarkdownGenerationResult( + raw_markdown=markdown, + markdown_with_citations=markdown, + references_markdown=markdown, + fit_markdown=fit_markdown + ) + + return { + 'markdown': markdown, + 'fit_markdown': fit_markdown, + 'fit_html': fit_html, + 'markdown_v2' : markdown_v2 + } + + def _get_content_of_website_optimized(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: success = True if not html: return None - soup = BeautifulSoup(html, 'html.parser') + # soup = BeautifulSoup(html, 'html.parser') + soup = BeautifulSoup(html, 'lxml') body = soup.body + try: + meta = extract_metadata("", soup) + except Exception as e: + self._log('error', + message="Error extracting metadata: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # print('Error extracting metadata:', str(e)) + meta = {} + image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) @@ -171,7 +227,26 @@ class WebScrappingStrategy(ContentScrappingStrategy): return text_content return None - def process_image(img, url, index, total_images): + def process_image_old(img, url, index, total_images): + def parse_srcset(srcset_str): + """Parse srcset attribute into list of image URLs with their sizes.""" + if not srcset_str: + return [] + + sources = [] + # Split on http/https and filter empty strings + urls = [f"http{part}" for part in srcset_str.split("http") if part] + + for url in urls: + # Remove trailing comma and whitespace, then split to get width + url = url.strip().rstrip(',') + parts = url.rsplit(' ', 1) + img_url = parts[0].strip() + width = parts[1].rstrip('w') if len(parts) > 1 else None + sources.append({'url': img_url, 'width': width}) + + return sources + #Check if an image has valid display and inside undesired html elements def is_valid_image(img, parent, 
def process_image(img, url, index, total_images):
    """Score one ``<img>`` tag and collect its distinct source variants.

    Args:
        img: The image tag to inspect. It is read through ``get``, ``attrs``,
            ``parent`` and ``find_parent``.
        url: URL of the page being scraped. Not used by this function; kept
            so existing call sites stay valid.
        index: Zero-based position of the image among the page's images.
        total_images: Number of images found on the page.

    Returns:
        A list of dicts, one per distinct non-``data:`` URL, each holding the
        keys ``alt``, ``desc``, ``score``, ``type``, ``src`` and ``width``.
        ``None`` when the image looks like UI chrome, scores at or below the
        configured threshold, or exposes no usable URL.
    """

    def parse_srcset(srcset):
        """Split a srcset value into ``{'url', 'width'}`` candidates.

        Candidates are separated by commas; the URL itself may contain commas
        as long as it does not end with one, so the value is tokenised on
        whitespace rather than split on ',' or on the literal "http".
        """
        candidates = []
        pending_url = None   # URL still waiting for its descriptor
        descriptor = None
        for token in srcset.split():
            while token:
                if pending_url is None:
                    token = token.lstrip(',')
                    if token.endswith(','):
                        # "url," -> candidate without a descriptor
                        candidates.append({'url': token.rstrip(','), 'width': None})
                    elif token:
                        pending_url, descriptor = token, None
                    break
                # Descriptor position: a comma closes the current candidate and
                # whatever follows it (if anything) starts the next URL.
                head, comma, token = token.partition(',')
                if head:
                    descriptor = head
                if comma:
                    candidates.append({
                        'url': pending_url,
                        'width': descriptor.rstrip('w') if descriptor else None,
                    })
                    pending_url = None
        if pending_url is not None:
            candidates.append({
                'url': pending_url,
                'width': descriptor.rstrip('w') if descriptor else None,
            })
        return candidates

    # Constants for checks
    classes_to_check = frozenset(['button', 'icon', 'logo'])
    tags_to_check = frozenset(['button', 'input'])

    # Pre-fetch commonly used attributes
    style = img.get('style', '')
    alt = img.get('alt', '')
    src = img.get('src', '')
    data_src = img.get('data-src', '')
    width = img.get('width')
    height = img.get('height')
    parent = img.parent
    parent_classes = parent.get('class', [])

    # Reject hidden images and images that look like buttons, icons or logos.
    # The keyword must be contained in the class name, not the other way round.
    if ('display:none' in style or
            parent.name in tags_to_check or
            any(keyword in cls for cls in parent_classes for keyword in classes_to_check) or
            any(keyword in src for keyword in classes_to_check) or
            any(keyword in alt for keyword in classes_to_check)):
        return None

    # Quick score calculation
    score = 0
    if width and width.isdigit() and int(width) > 150:
        score += 1
    if height and height.isdigit() and int(height) > 150:
        score += 1
    if alt:
        score += 1
    # Images in the first half of the page get a bonus point.
    if total_images and index / total_images < 0.5:
        score += 1

    if "data:image/" in src:
        image_format = src.split(',')[0].split(';')[0].split('/')[1]
    else:
        # Drop the query string first so dots inside it are not mistaken for
        # the file extension (e.g. "a.jpg?v=1.2").
        image_format = os.path.splitext(src.split('?')[0])[1].lower().strip('.')

    if image_format in ('jpg', 'png', 'webp', 'avif'):
        score += 1

    if score <= kwargs.get('image_score_threshold', IMAGE_SCORE_THRESHOLD):
        return None

    # Use set for deduplication
    unique_urls = set()
    image_variants = []

    # Base image info template
    base_info = {
        'alt': alt,
        'desc': find_closest_parent_with_useful_text(img),
        'score': score,
        'type': 'image'
    }

    def add_variant(variant_src, variant_width=None):
        """Record a source URL once, skipping empty values and data: URIs."""
        if variant_src and not variant_src.startswith('data:') and variant_src not in unique_urls:
            unique_urls.add(variant_src)
            image_variants.append({**base_info, 'src': variant_src, 'width': variant_width})

    def add_srcset(value):
        """Record every candidate of a srcset-style attribute value."""
        for candidate in parse_srcset(value):
            add_variant(candidate['url'], candidate['width'])

    # Process all sources
    add_variant(src)
    add_variant(data_src)

    # Handle srcset and data-srcset in one pass
    for attr in ('srcset', 'data-srcset'):
        value = img.get(attr)
        if value:
            add_srcset(value)

    # Sources declared on an enclosing <picture> element
    picture = img.find_parent('picture')
    if picture:
        for source in picture.find_all('source'):
            source_srcset = source.get('srcset')
            if source_srcset:
                add_srcset(source_srcset)

    # Framework-specific lazy-loading attributes (data-*src*, data-*srcset*)
    for attr, value in img.attrs.items():
        if not (attr.startswith('data-') and 'src' in attr
                and isinstance(value, str) and 'http' in value):
            continue
        if 'srcset' in attr:
            # A srcset-style value holds several candidates; never store the
            # whole string as a single URL.
            add_srcset(value)
        else:
            add_variant(value)

    return image_variants if image_variants else None
print('Error processing element:', str(e)) + self._log('error', + message="Error processing element: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) return False - - #process images by filtering and extracting contextual text from the page - # imgs = body.find_all('img') - # media['images'] = [ - # result for result in - # (process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs)) - # if result is not None - # ] - + process_element(body) # Update the links dictionary with unique links links['internal'] = list(internal_links_dict.values()) links['external'] = list(external_links_dict.values()) - # # Process images using ThreadPoolExecutor imgs = body.find_all('img') - with ThreadPoolExecutor() as executor: - image_results = list(executor.map(process_image, imgs, [url]*len(imgs), range(len(imgs)), [len(imgs)]*len(imgs))) - media['images'] = [result for result in image_results if result is not None] + # For test we use for loop instead of thread + media['images'] = [ + img for result in (process_image(img, url, i, len(imgs)) + for i, img in enumerate(imgs)) + if result is not None + for img in result + ] def flatten_nested_elements(node): if isinstance(node, NavigableString): @@ -478,8 +632,9 @@ class WebScrappingStrategy(ContentScrappingStrategy): # Replace base64 data with empty string img['src'] = base64_pattern.sub('', src) + str_body = "" try: - str(body) + str_body = body.encode_contents().decode('utf-8') except Exception as e: # Reset body to the original HTML success = False @@ -504,35 +659,26 @@ class WebScrappingStrategy(ContentScrappingStrategy): # Append the error div to the body body.body.append(error_div) + str_body = body.encode_contents().decode('utf-8') print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. 
Check the markdown for further details.") + self._log('error', + message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.", + tag="SCRAPE" + ) + cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') - cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') - - try: - h = CustomHTML2Text() - h.update_params(**kwargs.get('html2text', {})) - markdown = h.handle(cleaned_html) - except Exception as e: - markdown = h.handle(sanitize_html(cleaned_html)) - markdown = markdown.replace(' ```', '```') - - try: - meta = extract_metadata(html, soup) - except Exception as e: - print('Error extracting metadata:', str(e)) - meta = {} - - cleaner = ContentCleaningStrategy() - fit_html = cleaner.clean(cleaned_html) - fit_markdown = h.handle(fit_html) - - cleaned_html = sanitize_html(cleaned_html) + markdown_content = self._generate_markdown_content( + cleaned_html=cleaned_html, + html=html, + url=url, + success=success, + **kwargs + ) + return { - 'markdown': markdown, - 'fit_markdown': fit_markdown, - 'fit_html': fit_html, + **markdown_content, 'cleaned_html': cleaned_html, 'success': success, 'media': media, diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index ce802e49..898dcfa8 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -283,7 +283,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): print(f"[LOG] ✅ Crawled {url} successfully!") return html - except InvalidArgumentException: + except InvalidArgumentException as e: if not hasattr(e, 'msg'): e.msg = sanitize_input_encode(str(e)) raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}") diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py new file mode 100644 index 00000000..7922c413 --- /dev/null +++ b/crawl4ai/markdown_generation_strategy.py @@ -0,0 +1,116 @@ +from abc import ABC, abstractmethod +from typing 
import Optional, Dict, Any, Tuple +from .models import MarkdownGenerationResult +from .utils import CustomHTML2Text +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter +import re +from urllib.parse import urljoin + +# Pre-compile the regex pattern +LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') + +class MarkdownGenerationStrategy(ABC): + """Abstract base class for markdown generation strategies.""" + + @abstractmethod + def generate_markdown(self, + cleaned_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs) -> MarkdownGenerationResult: + """Generate markdown from cleaned HTML.""" + pass + +class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): + """Default implementation of markdown generation strategy.""" + + def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]: + link_map = {} + url_cache = {} # Cache for URL joins + parts = [] + last_end = 0 + counter = 1 + + for match in LINK_PATTERN.finditer(markdown): + parts.append(markdown[last_end:match.start()]) + text, url, title = match.groups() + + # Use cached URL if available, otherwise compute and cache + if base_url and not url.startswith(('http://', 'https://', 'mailto:')): + if url not in url_cache: + url_cache[url] = fast_urljoin(base_url, url) + url = url_cache[url] + + if url not in link_map: + desc = [] + if title: desc.append(title) + if text and text != title: desc.append(text) + link_map[url] = (counter, ": " + " - ".join(desc) if desc else "") + counter += 1 + + num = link_map[url][0] + parts.append(f"{text}⟨{num}⟩" if not match.group(0).startswith('!') else f"![{text}⟨{num}⟩]") + last_end = match.end() + + parts.append(markdown[last_end:]) + converted_text = ''.join(parts) + + # Pre-build reference strings + references = ["\n\n## References\n\n"] + references.extend( + 
f"⟨{num}⟩ {url}{desc}\n" + for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0]) + ) + + return converted_text, ''.join(references) + + def generate_markdown(self, + cleaned_html: str, + base_url: str = "", + html2text_options: Optional[Dict[str, Any]] = None, + content_filter: Optional[RelevantContentFilter] = None, + citations: bool = True, + **kwargs) -> MarkdownGenerationResult: + """Generate markdown with citations from cleaned HTML.""" + # Initialize HTML2Text with options + h = CustomHTML2Text() + if html2text_options: + h.update_params(**html2text_options) + + # Generate raw markdown + raw_markdown = h.handle(cleaned_html) + raw_markdown = raw_markdown.replace(' ```', '```') + + # Convert links to citations + if citations: + markdown_with_citations, references_markdown = self.convert_links_to_citations( + raw_markdown, base_url + ) + + # Generate fit markdown if content filter is provided + fit_markdown: Optional[str] = None + if content_filter: + filtered_html = content_filter.filter_content(cleaned_html) + filtered_html = '\n'.join('
{}
'.format(s) for s in filtered_html) + fit_markdown = h.handle(filtered_html) + + return MarkdownGenerationResult( + raw_markdown=raw_markdown, + markdown_with_citations=markdown_with_citations, + references_markdown=references_markdown, + fit_markdown=fit_markdown, + fit_html=filtered_html + ) + +def fast_urljoin(base: str, url: str) -> str: + """Fast URL joining for common cases.""" + if url.startswith(('http://', 'https://', 'mailto:', '//')): + return url + if url.startswith('/'): + # Handle absolute paths + if base.endswith('/'): + return base[:-1] + url + return base + url + return urljoin(base, url) \ No newline at end of file diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py new file mode 100644 index 00000000..77616086 --- /dev/null +++ b/crawl4ai/migrations.py @@ -0,0 +1,152 @@ +import os +import asyncio +import logging +from pathlib import Path +import aiosqlite +from typing import Optional +import xxhash +import aiofiles +import shutil +import time +from datetime import datetime + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class DatabaseMigration: + def __init__(self, db_path: str): + self.db_path = db_path + self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path)) + + def _ensure_content_dirs(self, base_path: str) -> dict: + dirs = { + 'html': 'html_content', + 'cleaned': 'cleaned_html', + 'markdown': 'markdown_content', + 'extracted': 'extracted_content', + 'screenshots': 'screenshots' + } + content_paths = {} + for key, dirname in dirs.items(): + path = os.path.join(base_path, dirname) + os.makedirs(path, exist_ok=True) + content_paths[key] = path + return content_paths + + def _generate_content_hash(self, content: str) -> str: + x = xxhash.xxh64() + x.update(content.encode()) + content_hash = x.hexdigest() + return content_hash + # return hashlib.sha256(content.encode()).hexdigest() + + async def _store_content(self, content: str, content_type: str) -> str: + if not content: + return 
"" + + content_hash = self._generate_content_hash(content) + file_path = os.path.join(self.content_paths[content_type], content_hash) + + if not os.path.exists(file_path): + async with aiofiles.open(file_path, 'w', encoding='utf-8') as f: + await f.write(content) + + return content_hash + + async def migrate_database(self): + """Migrate existing database to file-based storage""" + logger.info("Starting database migration...") + + try: + async with aiosqlite.connect(self.db_path) as db: + # Get all rows + async with db.execute( + '''SELECT url, html, cleaned_html, markdown, + extracted_content, screenshot FROM crawled_data''' + ) as cursor: + rows = await cursor.fetchall() + + migrated_count = 0 + for row in rows: + url, html, cleaned_html, markdown, extracted_content, screenshot = row + + # Store content in files and get hashes + html_hash = await self._store_content(html, 'html') + cleaned_hash = await self._store_content(cleaned_html, 'cleaned') + markdown_hash = await self._store_content(markdown, 'markdown') + extracted_hash = await self._store_content(extracted_content, 'extracted') + screenshot_hash = await self._store_content(screenshot, 'screenshots') + + # Update database with hashes + await db.execute(''' + UPDATE crawled_data + SET html = ?, + cleaned_html = ?, + markdown = ?, + extracted_content = ?, + screenshot = ? + WHERE url = ? + ''', (html_hash, cleaned_hash, markdown_hash, + extracted_hash, screenshot_hash, url)) + + migrated_count += 1 + if migrated_count % 100 == 0: + logger.info(f"Migrated {migrated_count} records...") + + await db.commit() + logger.info(f"Migration completed. {migrated_count} records processed.") + + except Exception as e: + logger.error(f"Migration failed: {e}") + raise + +async def backup_database(db_path: str) -> str: + """Create backup of existing database""" + if not os.path.exists(db_path): + logger.info("No existing database found. 
Skipping backup.") + return None + + # Create backup with timestamp + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + backup_path = f"{db_path}.backup_{timestamp}" + + try: + # Wait for any potential write operations to finish + await asyncio.sleep(1) + + # Create backup + shutil.copy2(db_path, backup_path) + logger.info(f"Database backup created at: {backup_path}") + return backup_path + except Exception as e: + logger.error(f"Backup failed: {e}") + raise + +async def run_migration(db_path: Optional[str] = None): + """Run database migration""" + if db_path is None: + db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") + + if not os.path.exists(db_path): + logger.info("No existing database found. Skipping migration.") + return + + # Create backup first + backup_path = await backup_database(db_path) + if not backup_path: + return + + migration = DatabaseMigration(db_path) + await migration.migrate_database() + +def main(): + """CLI entry point for migration""" + import argparse + parser = argparse.ArgumentParser(description='Migrate Crawl4AI database to file-based storage') + parser.add_argument('--db-path', help='Custom database path') + args = parser.parse_args() + + asyncio.run(run_migration(args.db_path)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 4ac06797..3a1b8bd1 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,10 +1,19 @@ from pydantic import BaseModel, HttpUrl -from typing import List, Dict, Optional +from typing import List, Dict, Optional, Callable, Awaitable, Union + + class UrlModel(BaseModel): url: HttpUrl forced: bool = False +class MarkdownGenerationResult(BaseModel): + raw_markdown: str + markdown_with_citations: str + references_markdown: str + fit_markdown: Optional[str] = None + fit_html: Optional[str] = None + class CrawlResult(BaseModel): url: str html: str @@ -12,8 +21,10 @@ class CrawlResult(BaseModel): cleaned_html: 
Optional[str] = None media: Dict[str, List[Dict]] = {} links: Dict[str, List[Dict]] = {} + downloaded_files: Optional[List[str]] = None screenshot: Optional[str] = None - markdown: Optional[str] = None + markdown: Optional[Union[str, MarkdownGenerationResult]] = None + markdown_v2: Optional[MarkdownGenerationResult] = None fit_markdown: Optional[str] = None fit_html: Optional[str] = None extracted_content: Optional[str] = None @@ -21,4 +32,17 @@ class CrawlResult(BaseModel): error_message: Optional[str] = None session_id: Optional[str] = None response_headers: Optional[dict] = None - status_code: Optional[int] = None \ No newline at end of file + status_code: Optional[int] = None + +class AsyncCrawlResponse(BaseModel): + html: str + response_headers: Dict[str, str] + status_code: int + screenshot: Optional[str] = None + get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None + downloaded_files: Optional[List[str]] = None + + class Config: + arbitrary_types_allowed = True + + diff --git a/crawl4ai/tools.py b/crawl4ai/tools.py new file mode 100644 index 00000000..ff36b53a --- /dev/null +++ b/crawl4ai/tools.py @@ -0,0 +1,34 @@ +import time +import cProfile +import pstats +from functools import wraps + +def profile_and_time(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + # Start timer + start_time = time.perf_counter() + + # Setup profiler + profiler = cProfile.Profile() + profiler.enable() + + # Run function + result = func(self, *args, **kwargs) + + # Stop profiler + profiler.disable() + + # Calculate elapsed time + elapsed_time = time.perf_counter() - start_time + + # Print timing + print(f"[PROFILER] Scraping completed in {elapsed_time:.2f} seconds") + + # Print profiling stats + stats = pstats.Stats(profiler) + stats.sort_stats('cumulative') # Sort by cumulative time + stats.print_stats(20) # Print top 20 time-consuming functions + + return result + return wrapper \ No newline at end of file diff --git a/crawl4ai/utils.py 
b/crawl4ai/utils.py index 1f15dea1..b07562df 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -14,6 +14,97 @@ from typing import Dict, Any from urllib.parse import urljoin import requests from requests.exceptions import InvalidSchema +import hashlib +from typing import Optional, Tuple, Dict, Any +import xxhash + + +from .html2text import HTML2Text +class CustomHTML2Text(HTML2Text): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.inside_pre = False + self.inside_code = False + self.preserve_tags = set() # Set of tags to preserve + self.current_preserved_tag = None + self.preserved_content = [] + self.preserve_depth = 0 + + # Configuration options + self.skip_internal_links = False + self.single_line_break = False + self.mark_code = False + self.include_sup_sub = False + self.body_width = 0 + self.ignore_mailto_links = True + self.ignore_links = False + self.escape_backslash = False + self.escape_dot = False + self.escape_plus = False + self.escape_dash = False + self.escape_snob = False + + def update_params(self, **kwargs): + """Update parameters and set preserved tags.""" + for key, value in kwargs.items(): + if key == 'preserve_tags': + self.preserve_tags = set(value) + else: + setattr(self, key, value) + + def handle_tag(self, tag, attrs, start): + # Handle preserved tags + if tag in self.preserve_tags: + if start: + if self.preserve_depth == 0: + self.current_preserved_tag = tag + self.preserved_content = [] + # Format opening tag with attributes + attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) + self.preserved_content.append(f'<{tag}{attr_str}>') + self.preserve_depth += 1 + return + else: + self.preserve_depth -= 1 + if self.preserve_depth == 0: + self.preserved_content.append(f'') + # Output the preserved HTML block with proper spacing + preserved_html = ''.join(self.preserved_content) + self.o('\n' + preserved_html + '\n') + self.current_preserved_tag = None + return + + # If we're 
inside a preserved tag, collect all content + if self.preserve_depth > 0: + if start: + # Format nested tags with attributes + attr_str = ''.join(f' {k}="{v}"' for k, v in attrs.items() if v is not None) + self.preserved_content.append(f'<{tag}{attr_str}>') + else: + self.preserved_content.append(f'') + return + + # Handle pre tags + if tag == 'pre': + if start: + self.o('```\n') + self.inside_pre = True + else: + self.o('\n```') + self.inside_pre = False + # elif tag in ["h1", "h2", "h3", "h4", "h5", "h6"]: + # pass + else: + super().handle_tag(tag, attrs, start) + + def handle_data(self, data, entity_char=False): + """Override handle_data to capture content within preserved tags.""" + if self.preserve_depth > 0: + self.preserved_content.append(data) + return + super().handle_data(data, entity_char) + + class InvalidCSSSelectorError(Exception): pass @@ -736,46 +827,54 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: 'metadata': meta } -def extract_metadata(html, soup = None): +def extract_metadata(html, soup=None): metadata = {} - if not html: + if not html and not soup: + return {} + + if not soup: + soup = BeautifulSoup(html, 'lxml') + + head = soup.head + if not head: return metadata - # Parse HTML content with BeautifulSoup - if not soup: - soup = BeautifulSoup(html, 'html.parser') - # Title - title_tag = soup.find('title') - metadata['title'] = title_tag.string if title_tag else None + title_tag = head.find('title') + metadata['title'] = title_tag.string.strip() if title_tag and title_tag.string else None # Meta description - description_tag = soup.find('meta', attrs={'name': 'description'}) - metadata['description'] = description_tag['content'] if description_tag else None + description_tag = head.find('meta', attrs={'name': 'description'}) + metadata['description'] = description_tag.get('content', '').strip() if description_tag else None # Meta keywords - keywords_tag = soup.find('meta', attrs={'name': 'keywords'}) - 
metadata['keywords'] = keywords_tag['content'] if keywords_tag else None + keywords_tag = head.find('meta', attrs={'name': 'keywords'}) + metadata['keywords'] = keywords_tag.get('content', '').strip() if keywords_tag else None # Meta author - author_tag = soup.find('meta', attrs={'name': 'author'}) - metadata['author'] = author_tag['content'] if author_tag else None + author_tag = head.find('meta', attrs={'name': 'author'}) + metadata['author'] = author_tag.get('content', '').strip() if author_tag else None # Open Graph metadata - og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')}) + og_tags = head.find_all('meta', attrs={'property': re.compile(r'^og:')}) for tag in og_tags: - property_name = tag['property'] - metadata[property_name] = tag['content'] + property_name = tag.get('property', '').strip() + content = tag.get('content', '').strip() + if property_name and content: + metadata[property_name] = content # Twitter Card metadata - twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')}) + twitter_tags = head.find_all('meta', attrs={'name': re.compile(r'^twitter:')}) for tag in twitter_tags: - property_name = tag['name'] - metadata[property_name] = tag['content'] - + property_name = tag.get('name', '').strip() + content = tag.get('content', '').strip() + if property_name and content: + metadata[property_name] = content + return metadata + def extract_xml_tags(string): tags = re.findall(r'<(\w+)>', string) return list(set(tags)) @@ -1046,3 +1145,82 @@ def is_external_url(url, base_domain): return False return False + +def clean_tokens(tokens: list[str]) -> list[str]: + # Set of tokens to remove + noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'} + + STOP_WORDS = { + 'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', + 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the', + 'to', 'was', 'were', 'will', 
'with', + + # Pronouns + 'i', 'you', 'he', 'she', 'it', 'we', 'they', + 'me', 'him', 'her', 'us', 'them', + 'my', 'your', 'his', 'her', 'its', 'our', 'their', + 'mine', 'yours', 'hers', 'ours', 'theirs', + 'myself', 'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves', + + # Common verbs + 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', + 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', + + # Prepositions + 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', + 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', + 'by', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into', + 'near', 'of', 'off', 'on', 'out', 'outside', 'over', 'past', 'through', + 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', + + # Conjunctions + 'and', 'but', 'or', 'nor', 'for', 'yet', 'so', + 'although', 'because', 'since', 'unless', + + # Articles + 'a', 'an', 'the', + + # Other common words + 'this', 'that', 'these', 'those', + 'what', 'which', 'who', 'whom', 'whose', + 'when', 'where', 'why', 'how', + 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', + 'can', 'cannot', "can't", 'could', "couldn't", + 'may', 'might', 'must', "mustn't", + 'shall', 'should', "shouldn't", + 'will', "won't", 'would', "wouldn't", + 'not', "n't", 'no', 'nor', 'none' + } + + # Single comprehension, more efficient than multiple passes + return [token for token in tokens + if len(token) > 2 + and token not in noise + and token not in STOP_WORDS + and not token.startswith('↑') + and not token.startswith('▲') + and not token.startswith('⬆')] + + +def generate_content_hash(content: str) -> str: + """Generate a unique hash for content""" + return xxhash.xxh64(content.encode()).hexdigest() + # return hashlib.sha256(content.encode()).hexdigest() + +def ensure_content_dirs(base_path: str) -> Dict[str, str]: + """Create content directories if they don't exist""" + dirs 
= { + 'html': 'html_content', + 'cleaned': 'cleaned_html', + 'markdown': 'markdown_content', + 'extracted': 'extracted_content', + 'screenshots': 'screenshots' + } + + content_paths = {} + for key, dirname in dirs.items(): + path = os.path.join(base_path, dirname) + os.makedirs(path, exist_ok=True) + content_paths[key] = path + + return content_paths \ No newline at end of file diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py new file mode 100644 index 00000000..8ae2de2e --- /dev/null +++ b/crawl4ai/version_manager.py @@ -0,0 +1,30 @@ +# version_manager.py +import os +from pathlib import Path +from packaging import version +from . import __version__ + +class VersionManager: + def __init__(self): + self.home_dir = Path.home() / ".crawl4ai" + self.version_file = self.home_dir / "version.txt" + + def get_installed_version(self): + """Get the version recorded in home directory""" + if not self.version_file.exists(): + return None + try: + return version.parse(self.version_file.read_text().strip()) + except: + return None + + def update_version(self): + """Update the version file to current library version""" + self.version_file.write_text(__version__.__version__) + + def needs_update(self): + """Check if database needs update based on version""" + installed = self.get_installed_version() + current = version.parse(__version__.__version__) + return installed is None or installed < current + diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index d44de183..a32a988d 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -10,6 +10,7 @@ from .extraction_strategy import * from .crawler_strategy import * from typing import List from concurrent.futures import ThreadPoolExecutor +from .content_scraping_strategy import WebScrapingStrategy from .config import * import warnings import json @@ -181,7 +182,21 @@ class WebCrawler: # Extract content from HTML try: t1 = time.time() - result = get_content_of_website_optimized(url, html, 
word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) + scrapping_strategy = WebScrapingStrategy() + extra_params = {k: v for k, v in kwargs.items() if k not in ["only_text", "image_description_min_word_threshold"]} + result = scrapping_strategy.scrap( + url, + html, + word_count_threshold=word_count_threshold, + css_selector=css_selector, + only_text=kwargs.get("only_text", False), + image_description_min_word_threshold=kwargs.get( + "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD + ), + **extra_params, + ) + + # result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) if verbose: print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds") diff --git a/deploy/railway/README.md b/deploy/railway/README.md new file mode 100644 index 00000000..155e7642 --- /dev/null +++ b/deploy/railway/README.md @@ -0,0 +1,19 @@ +# Railway Deployment + +## Quick Deploy +[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/crawl4ai) + +## Manual Setup +1. Fork this repository +2. Create a new Railway project +3. Configure environment variables: + - `INSTALL_TYPE`: basic or all + - `ENABLE_GPU`: true/false +4. Deploy! 
+ +## Configuration +See `railway.toml` for: +- Memory limits +- Health checks +- Restart policies +- Scaling options \ No newline at end of file diff --git a/deploy/railway/button.json b/deploy/railway/button.json new file mode 100644 index 00000000..1fc52167 --- /dev/null +++ b/deploy/railway/button.json @@ -0,0 +1,33 @@ +{ + "name": "Crawl4AI", + "description": "LLM Friendly Web Crawler & Scraper", + "render": { + "dockerfile": { + "path": "Dockerfile" + } + }, + "env": [ + { + "key": "INSTALL_TYPE", + "description": "Installation type (basic/all)", + "default": "basic", + "required": true + }, + { + "key": "ENABLE_GPU", + "description": "Enable GPU support", + "default": "false", + "required": false + } + ], + "services": [ + { + "name": "web", + "dockerfile": "./Dockerfile", + "healthcheck": { + "path": "/health", + "port": 11235 + } + } + ] + } \ No newline at end of file diff --git a/deploy/railway/railway.toml b/deploy/railway/railway.toml new file mode 100644 index 00000000..f24d8fab --- /dev/null +++ b/deploy/railway/railway.toml @@ -0,0 +1,18 @@ +# railway.toml +[build] +builder = "DOCKERFILE" +dockerfilePath = "Dockerfile" + +[deploy] +startCommand = "uvicorn main:app --host 0.0.0.0 --port $PORT" +healthcheckPath = "/health" +restartPolicyType = "ON_FAILURE" +restartPolicyMaxRetries = 3 + +[deploy.memory] +soft = 2048 # 2GB min for Playwright +hard = 4096 # 4GB max + +[deploy.scaling] +min = 1 +max = 1 diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml new file mode 100644 index 00000000..9bcfa982 --- /dev/null +++ b/docker-compose.hub.yml @@ -0,0 +1,27 @@ +services: + crawl4ai: + image: unclecode/crawl4ai:basic # Pull image from Docker Hub + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - 
CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml new file mode 100644 index 00000000..7dc41b47 --- /dev/null +++ b/docker-compose.local.yml @@ -0,0 +1,33 @@ +services: + crawl4ai: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: 3.10 + INSTALL_TYPE: all + ENABLE_GPU: false + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..1097ef11 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,62 @@ +services: + crawl4ai: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: 3.10 + INSTALL_TYPE: all + ENABLE_GPU: false + profiles: ["local"] + ports: + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + + crawl4ai-hub: + image: unclecode/crawl4ai:basic + profiles: ["hub"] + ports: + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index c22acd55..17ef9f04 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -7,12 +7,16 @@ import os from typing import Dict, Any class Crawl4AiTester: - def __init__(self, base_url: str = "http://localhost:11235"): + def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None): self.base_url = base_url + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" # Check environment variable as fallback + self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: # Submit crawl job - response = requests.post(f"{self.base_url}/crawl", 
json=request_data) + response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) + if response.status_code == 403: + raise Exception("API token is invalid or missing") task_id = response.json()["task_id"] print(f"Task ID: {task_id}") @@ -22,7 +26,7 @@ class Crawl4AiTester: if time.time() - start_time > timeout: raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds") - result = requests.get(f"{self.base_url}/task/{task_id}") + result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers) status = result.json() if status["status"] == "failed": @@ -33,9 +37,30 @@ class Crawl4AiTester: return status time.sleep(2) + + def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60) + if response.status_code == 408: + raise TimeoutError("Task did not complete within server timeout") + response.raise_for_status() + return response.json() + + def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]: + """Directly crawl without using task queue""" + response = requests.post( + f"{self.base_url}/crawl_direct", + json=request_data, + headers=self.headers + ) + response.raise_for_status() + return response.json() def test_docker_deployment(version="basic"): - tester = Crawl4AiTester() + tester = Crawl4AiTester( + base_url="http://localhost:11235" , + # base_url="https://api.crawl4ai.com" # just for example + # api_token="test" # just for example + ) print(f"Testing Crawl4AI Docker {version} version") # Health check with timeout and retry @@ -53,7 +78,10 @@ def test_docker_deployment(version="basic"): time.sleep(5) # Test cases based on version - test_basic_crawl(tester) + # test_basic_crawl(tester) + # test_basic_crawl(tester) + # test_basic_crawl_sync(tester) + test_basic_crawl_direct(tester) # if version in ["full", "transformer"]: # test_cosine_extraction(tester) @@ -70,7 
+98,8 @@ def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") request = { "urls": "https://www.nbcnews.com/business", - "priority": 10 + "priority": 10, + "session_id": "test" } result = tester.submit_and_wait(request) @@ -78,6 +107,34 @@ def test_basic_crawl(tester: Crawl4AiTester): assert result["result"]["success"] assert len(result["result"]["markdown"]) > 0 +def test_basic_crawl_sync(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Sync) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + "session_id": "test" + } + + result = tester.submit_sync(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['status'] == 'completed' + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + +def test_basic_crawl_direct(tester: Crawl4AiTester): + print("\n=== Testing Basic Crawl (Direct) ===") + request = { + "urls": "https://www.nbcnews.com/business", + "priority": 10, + # "session_id": "test" + "cache_mode": "bypass" # or "enabled", "disabled", "read_only", "write_only" + } + + result = tester.crawl_direct(request) + print(f"Basic crawl result length: {len(result['result']['markdown'])}") + assert result['result']['success'] + assert len(result['result']['markdown']) > 0 + def test_js_execution(tester: Crawl4AiTester): print("\n=== Testing JS Execution ===") request = { diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9c57f57d..d67a8c30 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -71,12 +71,12 @@ async def use_proxy(): "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example." 
) # Uncomment and modify the following lines to use a proxy - # async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: - # result = await crawler.arun( - # url="https://www.nbcnews.com/business", - # bypass_cache=True - # ) - # print(result.markdown[:500]) # Print first 500 characters + async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + bypass_cache=True + ) + print(result.markdown[:500]) # Print first 500 characters async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py new file mode 100644 index 00000000..362ae8fc --- /dev/null +++ b/docs/examples/v0.3.74.overview.py @@ -0,0 +1,277 @@ +import os, sys +# append the parent directory to the sys.path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +parent_parent_dir = os.path.dirname(parent_dir) +sys.path.append(parent_parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) +__data__ = os.path.join(__location__, "__data") +import asyncio +from pathlib import Path +import aiohttp +import json +from crawl4ai import AsyncWebCrawler, CacheMode +from crawl4ai.content_filter_strategy import BM25ContentFilter + +# 1. 
File Download Processing Example +async def download_example(): + """Example of downloading files from Python.org""" + # downloads_path = os.path.join(os.getcwd(), "downloads") + downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads") + os.makedirs(downloads_path, exist_ok=True) + + print(f"Downloads will be saved to: {downloads_path}") + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=downloads_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Find and click the first Windows installer link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { + console.log('Found download link:', downloadLink.href); + downloadLink.click(); + } else { + console.log('No .exe download link found'); + } + """, + delay_before_return_html=1, # Wait 1 second to ensure download starts + cache_mode=CacheMode.BYPASS + ) + + if result.downloaded_files: + print("\nDownload successful!") + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path}") + print(f" File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB") + else: + print("\nNo files were downloaded") + +# 2. Local File and Raw HTML Processing Example +async def local_and_raw_html_example(): + """Example of processing local files and raw HTML""" + # Create a sample HTML file + sample_file = os.path.join(__data__, "sample.html") + with open(sample_file, "w") as f: + f.write(""" + +

<h1>Test Content</h1> + <p>This is a test paragraph.</p>

+ + """) + + async with AsyncWebCrawler(verbose=True) as crawler: + # Process local file + local_result = await crawler.arun( + url=f"file://{os.path.abspath(sample_file)}" + ) + + # Process raw HTML + raw_html = """ + +

<h1>Raw HTML Test</h1> + <p>This is a test of raw HTML processing.</p>

+ + """ + raw_result = await crawler.arun( + url=f"raw:{raw_html}" + ) + + # Clean up + os.remove(sample_file) + + print("Local file content:", local_result.markdown) + print("\nRaw HTML content:", raw_result.markdown) + +# 3. Enhanced Markdown Generation Example +async def markdown_generation_example(): + """Example of enhanced markdown generation with citations and LLM-friendly features""" + async with AsyncWebCrawler(verbose=True) as crawler: + # Create a content filter (optional) + content_filter = BM25ContentFilter( + # user_query="History and cultivation", + bm25_threshold=1.0 + ) + + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + css_selector="main div#bodyContent", + content_filter=content_filter, + cache_mode=CacheMode.BYPASS + ) + + from crawl4ai import AsyncWebCrawler + from crawl4ai.content_filter_strategy import BM25ContentFilter + + result = await crawler.arun( + url="https://en.wikipedia.org/wiki/Apple", + css_selector="main div#bodyContent", + content_filter=BM25ContentFilter() + ) + print(result.markdown_v2.fit_markdown) + + print("\nMarkdown Generation Results:") + print(f"1. Original markdown length: {len(result.markdown)}") + print(f"2. 
New markdown versions (markdown_v2):") + print(f" - Raw markdown length: {len(result.markdown_v2.raw_markdown)}") + print(f" - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}") + print(f" - References section length: {len(result.markdown_v2.references_markdown)}") + if result.markdown_v2.fit_markdown: + print(f" - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}") + + # Save examples to files + output_dir = os.path.join(__data__, "markdown_examples") + os.makedirs(output_dir, exist_ok=True) + + # Save different versions + with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f: + f.write(result.markdown_v2.raw_markdown) + + with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f: + f.write(result.markdown_v2.markdown_with_citations) + + with open(os.path.join(output_dir, "3_references.md"), "w") as f: + f.write(result.markdown_v2.references_markdown) + + if result.markdown_v2.fit_markdown: + with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f: + f.write(result.markdown_v2.fit_markdown) + + print(f"\nMarkdown examples saved to: {output_dir}") + + # Show a sample of citations and references + print("\nSample of markdown with citations:") + print(result.markdown_v2.markdown_with_citations[:500] + "...\n") + print("Sample of references:") + print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...") + +# 4. 
Browser Management Example +async def browser_management_example(): + """Example of using enhanced browser management features""" + # Use the specified user directory path + user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile") + os.makedirs(user_data_dir, exist_ok=True) + + print(f"Browser profile will be saved to: {user_data_dir}") + + async with AsyncWebCrawler( + use_managed_browser=True, + user_data_dir=user_data_dir, + headless=False, + verbose=True + ) as crawler: + + result = await crawler.arun( + url="https://crawl4ai.com", + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS + ) + # Use GitHub as an example - it's a good test for browser management + # because it requires proper browser handling + result = await crawler.arun( + url="https://github.com/trending", + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS + ) + + print("\nBrowser session result:", result.success) + if result.success: + print("Page title:", result.metadata.get('title', 'No title found')) + +# 5. 
API Usage Example +async def api_example(): + """Example of using the new API endpoints""" + api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" + headers = {'Authorization': f'Bearer {api_token}'} + async with aiohttp.ClientSession() as session: + # Submit crawl job + crawl_request = { + "urls": ["https://news.ycombinator.com"], # Hacker News as an example + "extraction_config": { + "type": "json_css", + "params": { + "schema": { + "name": "Hacker News Articles", + "baseSelector": ".athing", + "fields": [ + { + "name": "title", + "selector": ".title a", + "type": "text" + }, + { + "name": "score", + "selector": ".score", + "type": "text" + }, + { + "name": "url", + "selector": ".title a", + "type": "attribute", + "attribute": "href" + } + ] + } + } + }, + "crawler_params": { + "headless": True, + # "use_managed_browser": True + }, + "cache_mode": "bypass", + # "screenshot": True, + # "magic": True + } + + async with session.post( + "http://localhost:11235/crawl", + json=crawl_request, + headers=headers + ) as response: + task_data = await response.json() + task_id = task_data["task_id"] + + # Check task status + while True: + async with session.get( + f"http://localhost:11235/task/{task_id}", + headers=headers + ) as status_response: + result = await status_response.json() + print(f"Task status: {result['status']}") + + if result["status"] == "completed": + print("Task completed!") + print("Results:") + news = json.loads(result["results"][0]['extracted_content']) + print(json.dumps(news[:4], indent=2)) + break + else: + await asyncio.sleep(1) + +# Main execution +async def main(): + # print("Running Crawl4AI feature examples...") + + # print("\n1. Running Download Example:") + # await download_example() + + # print("\n2. Running Markdown Generation Example:") + # await markdown_generation_example() + + # # print("\n3. Running Local and Raw HTML Example:") + # await local_and_raw_html_example() + + # # print("\n4. 
Running Browser Management Example:") + await browser_management_example() + + # print("\n5. Running API Example:") + await api_example() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/md_v2/advanced/managed_browser.md b/docs/md_v2/advanced/managed_browser.md new file mode 100644 index 00000000..80d6fc1a --- /dev/null +++ b/docs/md_v2/advanced/managed_browser.md @@ -0,0 +1,84 @@ +# Content Filtering in Crawl4AI + +This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies. + +## Relevance Content Filter + +The `RelevantContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. + +## BM25 Algorithm + +The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query. + +### Usage + +To use the `BM25ContentFilter`, initialize it and then pass it as the `extraction_strategy` parameter to the `arun` method of the crawler. 
+ +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query=None): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering + if result.success: + print(f"Filtered Content (JSON):\n{result.extracted_content}") + print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object + print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing. + else: + print("Error:", result.error_message) + +# Example usage: +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query. + +``` + +### Parameters + +- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query. +- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering. + + +## Fit Markdown Flag + +Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. 
Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`. + + +## Custom Content Filtering Strategies + +You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs. + +```python +from crawl4ai.content_filter_strategy import RelevantContentFilter +from bs4 import BeautifulSoup, Tag +from typing import List + +class MyCustomFilter(RelevantContentFilter): + def filter_content(self, html: str) -> List[str]: + soup = BeautifulSoup(html, 'lxml') + # Implement custom filtering logic here + # Example: extract all paragraphs within divs with class "article-body" + filtered_paragraphs = [] + for tag in soup.select("div.article-body p"): + if isinstance(tag, Tag): + filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element. + return filtered_paragraphs + + + +async def custom_filter_demo(url: str): + async with AsyncWebCrawler() as crawler: + custom_filter = MyCustomFilter() + result = await crawler.arun(url, extraction_strategy=custom_filter) + if result.success: + print(result.extracted_content) + +``` + +This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques. + +## Conclusion + +Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline. 
diff --git a/docs/md_v2/advanced/session-management-advanced.md b/docs/md_v2/advanced/session-management-advanced.md index f8c81da2..908828f7 100644 --- a/docs/md_v2/advanced/session-management-advanced.md +++ b/docs/md_v2/advanced/session-management-advanced.md @@ -30,7 +30,7 @@ Let's start with a basic example of session-based crawling: ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def basic_session_crawl(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -43,7 +43,7 @@ async def basic_session_crawl(): session_id=session_id, js_code="document.querySelector('.load-more-button').click();" if page > 0 else None, css_selector=".content-item", - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(f"Page {page + 1}: Found {result.extracted_content.count('.content-item')} items") @@ -102,7 +102,7 @@ async def advanced_session_crawl_with_hooks(): session_id=session_id, css_selector="li.commit-item", js_code=js_next_page if page > 0 else None, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, js_only=page > 0 ) @@ -174,7 +174,7 @@ async def integrated_js_and_wait_crawl(): extraction_strategy=extraction_strategy, js_code=js_next_page_and_wait if page > 0 else None, js_only=page > 0, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) commits = json.loads(result.extracted_content) @@ -241,7 +241,7 @@ async def wait_for_parameter_crawl(): js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) commits = json.loads(result.extracted_content) diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index c38ed852..eae4cf7b 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -75,7 +75,7 @@ async def crawl_dynamic_content(): js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else 
None, js_only=page > 0, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) if result.success: diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md index 9ef73aef..509991e5 100644 --- a/docs/md_v2/api/arun.md +++ b/docs/md_v2/api/arun.md @@ -8,11 +8,26 @@ The following parameters can be passed to the `arun()` method. They are organize await crawler.arun( url="https://example.com", # Required: URL to crawl verbose=True, # Enable detailed logging - bypass_cache=False, # Skip cache for this request + cache_mode=CacheMode.ENABLED, # Control cache behavior warmup=True # Whether to run warmup check ) ``` +## Cache Control + +```python +from crawl4ai import CacheMode + +await crawler.arun( + cache_mode=CacheMode.ENABLED, # Normal caching (read/write) + # Other cache modes: + # cache_mode=CacheMode.DISABLED # No caching at all + # cache_mode=CacheMode.READ_ONLY # Only read from cache + # cache_mode=CacheMode.WRITE_ONLY # Only write to cache + # cache_mode=CacheMode.BYPASS # Skip cache for this operation +) +``` + ## Content Processing Parameters ### Text Processing @@ -162,14 +177,13 @@ await crawler.arun( ## Parameter Interactions and Notes -1. **Magic Mode Combinations** +1. 
**Cache and Performance Setup** ```python - # Full anti-detection setup + # Optimal caching for repeated crawls await crawler.arun( - magic=True, - headless=False, - simulate_user=True, - override_navigator=True + cache_mode=CacheMode.ENABLED, + word_count_threshold=10, + process_iframes=False ) ``` @@ -179,7 +193,8 @@ await crawler.arun( await crawler.arun( js_code="window.scrollTo(0, document.body.scrollHeight);", wait_for="css:.lazy-content", - delay_before_return_html=2.0 + delay_before_return_html=2.0, + cache_mode=CacheMode.WRITE_ONLY # Cache results after dynamic load ) ``` @@ -192,7 +207,8 @@ await crawler.arun( extraction_strategy=my_strategy, chunking_strategy=my_chunking, process_iframes=True, - remove_overlay_elements=True + remove_overlay_elements=True, + cache_mode=CacheMode.ENABLED ) ``` @@ -201,7 +217,7 @@ await crawler.arun( 1. **Performance Optimization** ```python await crawler.arun( - bypass_cache=False, # Use cache when possible + cache_mode=CacheMode.ENABLED, # Use full caching word_count_threshold=10, # Filter out noise process_iframes=False # Skip iframes if not needed ) @@ -212,7 +228,8 @@ await crawler.arun( await crawler.arun( magic=True, # Enable anti-detection delay_before_return_html=1.0, # Wait for dynamic content - page_timeout=60000 # Longer timeout for slow pages + page_timeout=60000, # Longer timeout for slow pages + cache_mode=CacheMode.WRITE_ONLY # Cache results after successful crawl ) ``` @@ -221,6 +238,7 @@ await crawler.arun( await crawler.arun( remove_overlay_elements=True, # Remove popups excluded_tags=['nav', 'aside'],# Remove unnecessary elements - keep_data_attributes=False # Remove data attributes + keep_data_attributes=False, # Remove data attributes + cache_mode=CacheMode.ENABLED # Use cache for faster processing ) ``` \ No newline at end of file diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md index 06998af3..7e3bda98 100644 --- a/docs/md_v2/api/crawl-result.md +++ 
b/docs/md_v2/api/crawl-result.md @@ -20,6 +20,7 @@ class CrawlResult(BaseModel): fit_html: Optional[str] = None # Most relevant HTML content markdown: Optional[str] = None # HTML converted to markdown fit_markdown: Optional[str] = None # Most relevant markdown content + downloaded_files: Optional[List[str]] = None # Downloaded files # Extracted Data extracted_content: Optional[str] = None # Content from extraction strategy diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 6c7960d2..c1c4d2ea 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -32,4 +32,5 @@ | async_webcrawler.py | warmup | `kwargs.get("warmup", True)` | AsyncWebCrawler | Initialize crawler with warmup request | | async_webcrawler.py | session_id | `kwargs.get("session_id", None)` | AsyncWebCrawler | Session identifier for browser reuse | | async_webcrawler.py | only_text | `kwargs.get("only_text", False)` | AsyncWebCrawler | Extract only text content | -| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl | \ No newline at end of file +| async_webcrawler.py | bypass_cache | `kwargs.get("bypass_cache", False)` | AsyncWebCrawler | Skip cache and force fresh crawl | +| async_webcrawler.py | cache_mode | `kwargs.get("cache_mode", CacheMode.ENABLED)` | AsyncWebCrawler | Cache handling mode for request | \ No newline at end of file diff --git a/docs/md_v2/basic/cache-modes.md b/docs/md_v2/basic/cache-modes.md new file mode 100644 index 00000000..04a4f218 --- /dev/null +++ b/docs/md_v2/basic/cache-modes.md @@ -0,0 +1,79 @@ +# Crawl4AI Cache System and Migration Guide + +## Overview +Starting from version 0.3.74, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. 
+ +## Old vs New Approach + +### Old Way (Deprecated) +The old system used multiple boolean flags: +- `bypass_cache`: Skip cache entirely +- `disable_cache`: Disable all caching +- `no_cache_read`: Don't read from cache +- `no_cache_write`: Don't write to cache + +### New Way (Recommended) +The new system uses a single `CacheMode` enum: +- `CacheMode.ENABLED`: Normal caching (read/write) +- `CacheMode.DISABLED`: No caching at all +- `CacheMode.READ_ONLY`: Only read from cache +- `CacheMode.WRITE_ONLY`: Only write to cache +- `CacheMode.BYPASS`: Skip cache for this operation + +## Migration Example + +### Old Code (Deprecated) +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + bypass_cache=True # Old way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### New Code (Recommended) +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CacheMode # Import CacheMode + +async def use_proxy(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + cache_mode=CacheMode.BYPASS # New way + ) + print(len(result.markdown)) + +async def main(): + await use_proxy() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Common Migration Patterns + +Old Flag | New Mode +---------|---------- +`bypass_cache=True` | `cache_mode=CacheMode.BYPASS` +`disable_cache=True` | `cache_mode=CacheMode.DISABLED` +`no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` +`no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` + +## Suppressing Deprecation Warnings +If you need time to migrate, you can temporarily suppress deprecation warnings: +```python +# In your config.py +SHOW_DEPRECATION_WARNINGS = False +``` diff --git 
a/docs/md_v2/basic/content_filtering.md b/docs/md_v2/basic/content_filtering.md new file mode 100644 index 00000000..9506c075 --- /dev/null +++ b/docs/md_v2/basic/content_filtering.md @@ -0,0 +1,84 @@ +# Content Filtering in Crawl4AI + +This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies. + +## Relevance Content Filter + +The `RelevantContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks. + +## BM25 Algorithm + +The `BM25ContentFilter` uses the BM25 algorithm, a ranking function used in information retrieval to estimate the relevance of documents to a given search query. In Crawl4AI, this algorithm helps to identify and extract text chunks that are most relevant to the page's metadata or a user-specified query. + +### Usage + +To use the `BM25ContentFilter`, initialize it and then pass it as the `content_filter` parameter to the `arun` method of the crawler. 
+ +```python +from crawl4ai import AsyncWebCrawler +from crawl4ai.content_filter_strategy import BM25ContentFilter + +async def filter_content(url, query=None): + async with AsyncWebCrawler() as crawler: + content_filter = BM25ContentFilter(user_query=query) + result = await crawler.arun(url=url, content_filter=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering + if result.success: + print(f"Filtered Content (JSON):\n{result.extracted_content}") + print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object + print(f"\nFiltered HTML:\n{result.fit_html}") # New field in CrawlResult object. Note that raw HTML may have tags re-organized due to internal parsing. + else: + print("Error:", result.error_message) + +# Example usage: +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple", "fruit nutrition health")) # with query +asyncio.run(filter_content("https://en.wikipedia.org/wiki/Apple")) # without query, metadata will be used as the query. + +``` + +### Parameters + +- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts relevant metadata (title, description, keywords) from the page and uses that as the query. +- **`bm25_threshold`**: (Optional, default 1.0) A float value that controls the threshold for relevance. Higher values result in stricter filtering, returning only the most relevant text chunks. Lower values result in more lenient filtering. + + +## Fit Markdown Flag + +Setting the `fit_markdown` flag to `True` in the `arun` method activates the BM25 content filtering during the crawl. The `fit_markdown` parameter instructs the scraper to extract and clean the HTML, primarily to prepare for a Large Language Model that cannot process large amounts of data. 
Setting this flag not only improves the quality of the extracted content but also adds the filtered content to two new attributes in the returned `CrawlResult` object: `fit_markdown` and `fit_html`. + + +## Custom Content Filtering Strategies + +You can create your own custom filtering strategies by inheriting from the `RelevantContentFilter` class and implementing the `filter_content` method. This allows you to tailor the filtering logic to your specific needs. + +```python +from crawl4ai.content_filter_strategy import RelevantContentFilter +from bs4 import BeautifulSoup, Tag +from typing import List + +class MyCustomFilter(RelevantContentFilter): + def filter_content(self, html: str) -> List[str]: + soup = BeautifulSoup(html, 'lxml') + # Implement custom filtering logic here + # Example: extract all paragraphs within divs with class "article-body" + filtered_paragraphs = [] + for tag in soup.select("div.article-body p"): + if isinstance(tag, Tag): + filtered_paragraphs.append(str(tag)) # Add the cleaned HTML element. + return filtered_paragraphs + + + +async def custom_filter_demo(url: str): + async with AsyncWebCrawler() as crawler: + custom_filter = MyCustomFilter() + result = await crawler.arun(url, content_filter=custom_filter) + if result.success: + print(result.extracted_content) + +``` + +This example demonstrates extracting paragraphs from a specific div class. You can customize this logic to implement different filtering strategies, use regular expressions, analyze text density, or apply other relevant techniques. + +## Conclusion + +Content filtering strategies provide a powerful way to refine the output of your crawls. By using `BM25ContentFilter` or creating custom strategies, you can focus on the most pertinent information and improve the efficiency of your data processing pipeline. 
diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index cc11d0d9..87e468aa 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -7,66 +7,325 @@ Crawl4AI provides official Docker images for easy deployment and scalability. Th Pull and run the basic version: ```bash +# Basic run without security docker pull unclecode/crawl4ai:basic docker run -p 11235:11235 unclecode/crawl4ai:basic + +# Run with API security enabled +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic ``` -Test the deployment: +## Running with Docker Compose 🐳 + +### Use Docker Compose (From Local Dockerfile or Docker Hub) + +Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub. + +### **Option 1: Using Docker Compose to Build Locally** +If you want to build the image locally, use the provided `docker-compose.local.yml` file. + +```bash +docker-compose -f docker-compose.local.yml up -d +``` + +This will: +1. Build the Docker image from the provided `Dockerfile`. +2. Start the container and expose it on `http://localhost:11235`. + +--- + +### **Option 2: Using Docker Compose with Pre-Built Image from Hub** +If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file. + +```bash +docker-compose -f docker-compose.hub.yml up -d +``` + +This will: +1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration). +2. Start the container and expose it on `http://localhost:11235`. 
+ +--- + +### **Stopping the Running Services** + +To stop the services started via Docker Compose, you can use: + +```bash +docker-compose -f docker-compose.local.yml down +# OR +docker-compose -f docker-compose.hub.yml down +``` + +If the containers don’t stop and the application is still running, check the running containers: + +```bash +docker ps +``` + +Find the `CONTAINER ID` of the running service and stop it forcefully: + +```bash +docker stop <CONTAINER_ID> +``` + +--- + +### **Debugging with Docker Compose** + +- **Check Logs**: To view the container logs: + ```bash + docker-compose -f docker-compose.local.yml logs -f + ``` + +- **Remove Orphaned Containers**: If the service is still running unexpectedly: + ```bash + docker-compose -f docker-compose.local.yml down --remove-orphans + ``` + +- **Manually Remove Network**: If the network is still in use: + ```bash + docker network ls + docker network rm crawl4ai_default + ``` + +--- + +### Why Use Docker Compose? + +Docker Compose is the recommended way to deploy Crawl4AI because: +1. It simplifies multi-container setups. +2. It allows you to define environment variables, resources, and ports in a single file. +3. It makes it easier to switch between local development and production-ready images. + +For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent. 
+ + + + +## API Security 🔒 + +### Understanding CRAWL4AI_API_TOKEN + +The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance: + +- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication +- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible + +```bash +# Secured Instance +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all + +# Unsecured Instance +docker run -p 11235:11235 unclecode/crawl4ai:all +``` + +### Making API Calls + +For secured instances, include the token in all requests: + ```python import requests -# Test health endpoint -health = requests.get("http://localhost:11235/health") -print("Health check:", health.json()) +# Setup headers if token is being used +api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN +headers = {"Authorization": f"Bearer {api_token}"} if api_token else {} -# Test basic crawl +# Making authenticated requests response = requests.post( "http://localhost:11235/crawl", + headers=headers, json={ - "urls": "https://www.nbcnews.com/business", + "urls": "https://example.com", "priority": 10 } ) + +# Checking task status task_id = response.json()["task_id"] -print("Task ID:", task_id) +status = requests.get( + f"http://localhost:11235/task/{task_id}", + headers=headers +) ``` -## Available Images 🏷️ +### Using with Docker Compose -- `unclecode/crawl4ai:basic` - Basic web crawling capabilities -- `unclecode/crawl4ai:all` - Full installation with all features -- `unclecode/crawl4ai:gpu` - GPU-enabled version for ML features +In your `docker-compose.yml`: +```yaml +services: + crawl4ai: + image: unclecode/crawl4ai:all + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional + # ... other configuration +``` + +Then either: +1. Set in `.env` file: +```env +CRAWL4AI_API_TOKEN=your_secret_token +``` + +2. 
Or set via command line: +```bash +CRAWL4AI_API_TOKEN=your_secret_token docker-compose up +``` + +> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`). ## Configuration Options 🔧 ### Environment Variables +You can configure the service using environment variables: + ```bash +# Basic configuration docker run -p 11235:11235 \ -e MAX_CONCURRENT_TASKS=5 \ - -e OPENAI_API_KEY=your_key \ unclecode/crawl4ai:all -``` -### Volume Mounting - -Mount a directory for persistent data: -```bash +# With security and LLM support docker run -p 11235:11235 \ - -v $(pwd)/data:/app/data \ + -e CRAWL4AI_API_TOKEN=your_secret_token \ + -e OPENAI_API_KEY=sk-... \ + -e ANTHROPIC_API_KEY=sk-ant-... \ unclecode/crawl4ai:all ``` -### Resource Limits +### Using Docker Compose (Recommended) 🐳 -Control container resources: +Create a `docker-compose.yml`: + +```yaml +version: '3.8' + +services: + crawl4ai: + image: unclecode/crawl4ai:all + ports: + - "11235:11235" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security + - MAX_CONCURRENT_TASKS=5 + # LLM Provider Keys + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G +``` + +You can run it in two ways: + +1. Using environment variables directly: ```bash -docker run -p 11235:11235 \ - --memory=4g \ - --cpus=2 \ - unclecode/crawl4ai:all +CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up ``` +2. Using a `.env` file (recommended): +Create a `.env` file in the same directory: +```env +# API Security (optional) +CRAWL4AI_API_TOKEN=your_secret_token + +# LLM Provider Keys +OPENAI_API_KEY=sk-... +ANTHROPIC_API_KEY=sk-ant-... 
+ +# Other Configuration +MAX_CONCURRENT_TASKS=5 +``` + +Then simply run: +```bash +docker-compose up +``` + +### Testing the Deployment 🧪 + +```python +import requests + +# For unsecured instances +def test_unsecured(): + # Health check + health = requests.get("http://localhost:11235/health") + print("Health check:", health.json()) + + # Basic crawl + response = requests.post( + "http://localhost:11235/crawl", + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) + +# For secured instances +def test_secured(api_token): + headers = {"Authorization": f"Bearer {api_token}"} + + # Basic crawl with authentication + response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) +``` + +### LLM Extraction Example 🤖 + +When you've configured your LLM provider keys (via environment variables or `.env`), you can use LLM extraction: + +```python +request = { + "urls": "https://example.com", + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4", + "instruction": "Extract main topics from the page" + } + } +} + +# Make the request (add headers if using API security) +response = requests.post("http://localhost:11235/crawl", json=request) +``` + +> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! + + + + + + + + + + + + + + + + + + ## Usage Examples 📝 ### Basic Crawling diff --git a/docs/md_v2/basic/file-download.md b/docs/md_v2/basic/file-download.md new file mode 100644 index 00000000..c37e8812 --- /dev/null +++ b/docs/md_v2/basic/file-download.md @@ -0,0 +1,148 @@ +# Download Handling in Crawl4AI + +This guide explains how to use Crawl4AI to handle file downloads during crawling. 
You'll learn how to trigger downloads, specify download locations, and access downloaded files. + +## Enabling Downloads + +By default, Crawl4AI does not download files. To enable downloads, set the `accept_downloads` parameter to `True` in either the `AsyncWebCrawler` constructor or the `arun` method. + +```python +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler(accept_downloads=True) as crawler: # Globally enable downloads + # ... your crawling logic ... + +asyncio.run(main()) +``` + +Or, enable it for a specific crawl: + +```python +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="...", accept_downloads=True) + # ... +``` + +## Specifying Download Location + +You can specify the download directory using the `downloads_path` parameter. If not provided, Crawl4AI creates a "downloads" directory inside the `.crawl4ai` folder in your home directory. + +```python +import os +from pathlib import Path + +# ... inside your crawl function: + +downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path +os.makedirs(downloads_path, exist_ok=True) + +result = await crawler.arun(url="...", downloads_path=downloads_path, accept_downloads=True) + +# ... +``` + +If you are setting it globally, provide the path to the AsyncWebCrawler: +```python +async def crawl_with_downloads(url: str, download_path: str): + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=download_path, # or set it on arun + verbose=True + ) as crawler: + result = await crawler.arun(url=url) # you still need to enable downloads per call. + # ... +``` + + + +## Triggering Downloads + +Downloads are typically triggered by user interactions on a web page (e.g., clicking a download button). You can simulate these actions with the `js_code` parameter, injecting JavaScript code to be executed within the browser context. 
The `wait_for` parameter might also be crucial to allowing sufficient time for downloads to initiate before the crawler proceeds. + +```python +result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Find and click the first Windows installer link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) { + downloadLink.click(); + } + """, + wait_for=5 # Wait for 5 seconds for the download to start +) +``` + +## Accessing Downloaded Files + +Downloaded file paths are stored in the `downloaded_files` attribute of the returned `CrawlResult` object. This is a list of strings, with each string representing the absolute path to a downloaded file. + +```python +if result.downloaded_files: + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path}") + # Perform operations with downloaded files, e.g., check file size + file_size = os.path.getsize(file_path) + print(f"- File size: {file_size} bytes") +else: + print("No files downloaded.") +``` + + +## Example: Downloading Multiple Files + +```python +import asyncio +import os +from pathlib import Path +from crawl4ai import AsyncWebCrawler + +async def download_multiple_files(url: str, download_path: str): + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=download_path, + verbose=True + ) as crawler: + result = await crawler.arun( + url=url, + js_code=""" + // Trigger multiple downloads (example) + const downloadLinks = document.querySelectorAll('a[download]'); // Or a more specific selector + for (const link of downloadLinks) { + link.click(); + await new Promise(r => setTimeout(r, 2000)); // Add a small delay between clicks if needed + } + """, + wait_for=10 # Adjust the timeout to match the expected time for all downloads to start + ) + + if result.downloaded_files: + print("Downloaded files:") + for file in result.downloaded_files: + print(f"- {file}") + else: + print("No files downloaded.") + + 
+# Example usage +download_path = os.path.join(Path.home(), ".crawl4ai", "downloads") +os.makedirs(download_path, exist_ok=True) # Create directory if it doesn't exist + + +asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path)) +``` + +## Important Considerations + +- **Browser Context:** Downloads are managed within the browser context. Ensure your `js_code` correctly targets the download triggers on the specific web page. +- **Waiting:** Use `wait_for` to manage the timing of the crawl process if immediate download might not occur. +- **Error Handling:** Implement proper error handling to gracefully manage failed downloads or incorrect file paths. +- **Security:** Downloaded files should be scanned for potential security threats before use. + + + +This guide provides a foundation for handling downloads with Crawl4AI. You can adapt these techniques to manage downloads in various scenarios and integrate them into more complex crawling workflows. diff --git a/docs/md_v2/basic/installation.md b/docs/md_v2/basic/installation.md index a4a60857..de8aeafa 100644 --- a/docs/md_v2/basic/installation.md +++ b/docs/md_v2/basic/installation.md @@ -58,6 +58,51 @@ crawl4ai-download-models This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation. 
+## Playwright Installation Note for Ubuntu + +If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies: + +```bash +sudo apt-get install -y \ + libwoff1 \ + libopus0 \ + libwebp7 \ + libwebpdemux2 \ + libenchant-2-2 \ + libgudev-1.0-0 \ + libsecret-1-0 \ + libhyphen0 \ + libgdk-pixbuf2.0-0 \ + libegl1 \ + libnotify4 \ + libxslt1.1 \ + libevent-2.1-7 \ + libgles2 \ + libxcomposite1 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libepoxy0 \ + libgtk-3-0 \ + libharfbuzz-icu0 \ + libgstreamer-gl1.0-0 \ + libgstreamer-plugins-bad1.0-0 \ + gstreamer1.0-plugins-good \ + gstreamer1.0-plugins-bad \ + libxt6 \ + libxaw7 \ + xvfb \ + fonts-noto-color-emoji \ + libfontconfig \ + libfreetype6 \ + xfonts-cyrillic \ + xfonts-scalable \ + fonts-liberation \ + fonts-ipafont-gothic \ + fonts-wqy-zenhei \ + fonts-tlwg-loma-otf \ + fonts-freefont-ttf +``` + ## Option 2: Using Docker (Coming Soon) Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems. diff --git a/docs/md_v2/basic/prefix-based-input.md b/docs/md_v2/basic/prefix-based-input.md new file mode 100644 index 00000000..42987a67 --- /dev/null +++ b/docs/md_v2/basic/prefix-based-input.md @@ -0,0 +1,235 @@ +# Prefix-Based Input Handling in Crawl4AI + +This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example. 
+ +## Table of Contents +- [Prefix-Based Input Handling in Crawl4AI](#prefix-based-input-handling-in-crawl4ai) + - [Table of Contents](#table-of-contents) + - [Crawling a Web URL](#crawling-a-web-url) + - [Crawling a Local HTML File](#crawling-a-local-html-file) + - [Crawling Raw HTML Content](#crawling-raw-html-content) + - [Complete Example](#complete-example) + - [**How It Works**](#how-it-works) + - [**Running the Example**](#running-the-example) + - [Conclusion](#conclusion) + +--- + + +### Crawling a Web URL + +To crawl a live web page, provide the URL starting with `http://` or `https://`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def crawl_web(): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", bypass_cache=True) + if result.success: + print("Markdown Content:") + print(result.markdown) + else: + print(f"Failed to crawl: {result.error_message}") + +asyncio.run(crawl_web()) +``` + +### Crawling a Local HTML File + +To crawl a local HTML file, prefix the file path with `file://`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def crawl_local_file(): + local_file_path = "/path/to/apple.html" # Replace with your file path + file_url = f"file://{local_file_path}" + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url=file_url, bypass_cache=True) + if result.success: + print("Markdown Content from Local File:") + print(result.markdown) + else: + print(f"Failed to crawl local file: {result.error_message}") + +asyncio.run(crawl_local_file()) +``` + +### Crawling Raw HTML Content + +To crawl raw HTML content, prefix the HTML string with `raw:`. + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def crawl_raw_html(): + raw_html = "

<html><body><h1>Hello, World!</h1></body></html>

" + raw_html_url = f"raw:{raw_html}" + + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun(url=raw_html_url, bypass_cache=True) + if result.success: + print("Markdown Content from Raw HTML:") + print(result.markdown) + else: + print(f"Failed to crawl raw HTML: {result.error_message}") + +asyncio.run(crawl_raw_html()) +``` + +--- + +## Complete Example + +Below is a comprehensive script that: +1. **Crawls the Wikipedia page for "Apple".** +2. **Saves the HTML content to a local file (`apple.html`).** +3. **Crawls the local HTML file and verifies the markdown length matches the original crawl.** +4. **Crawls the raw HTML content from the saved file and verifies consistency.** + +```python +import os +import sys +import asyncio +from pathlib import Path + +# Adjust the parent directory to include the crawl4ai module +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai import AsyncWebCrawler + +async def main(): + # Define the URL to crawl + wikipedia_url = "https://en.wikipedia.org/wiki/apple" + + # Define the path to save the HTML file + # Save the file in the same directory as the script + script_dir = Path(__file__).parent + html_file_path = script_dir / "apple.html" + + async with AsyncWebCrawler(verbose=True) as crawler: + print("\n=== Step 1: Crawling the Wikipedia URL ===") + # Crawl the Wikipedia URL + result = await crawler.arun(url=wikipedia_url, bypass_cache=True) + + # Check if crawling was successful + if not result.success: + print(f"Failed to crawl {wikipedia_url}: {result.error_message}") + return + + # Save the HTML content to a local file + with open(html_file_path, 'w', encoding='utf-8') as f: + f.write(result.html) + print(f"Saved HTML content to {html_file_path}") + + # Store the length of the generated markdown + web_crawl_length = len(result.markdown) + print(f"Length of markdown from web crawl: {web_crawl_length}\n") + + print("=== Step 2: 
Crawling from the Local HTML File ===") + # Construct the file URL with 'file://' prefix + file_url = f"file://{html_file_path.resolve()}" + + # Crawl the local HTML file + local_result = await crawler.arun(url=file_url, bypass_cache=True) + + # Check if crawling was successful + if not local_result.success: + print(f"Failed to crawl local file {file_url}: {local_result.error_message}") + return + + # Store the length of the generated markdown from local file + local_crawl_length = len(local_result.markdown) + print(f"Length of markdown from local file crawl: {local_crawl_length}") + + # Compare the lengths + assert web_crawl_length == local_crawl_length, ( + f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Local file crawl ({local_crawl_length})" + ) + print("✅ Markdown length matches between web crawl and local file crawl.\n") + + print("=== Step 3: Crawling Using Raw HTML Content ===") + # Read the HTML content from the saved file + with open(html_file_path, 'r', encoding='utf-8') as f: + raw_html_content = f.read() + + # Prefix the raw HTML content with 'raw:' + raw_html_url = f"raw:{raw_html_content}" + + # Crawl using the raw HTML content + raw_result = await crawler.arun(url=raw_html_url, bypass_cache=True) + + # Check if crawling was successful + if not raw_result.success: + print(f"Failed to crawl raw HTML content: {raw_result.error_message}") + return + + # Store the length of the generated markdown from raw HTML + raw_crawl_length = len(raw_result.markdown) + print(f"Length of markdown from raw HTML crawl: {raw_crawl_length}") + + # Compare the lengths + assert web_crawl_length == raw_crawl_length, ( + f"Markdown length mismatch: Web crawl ({web_crawl_length}) != Raw HTML crawl ({raw_crawl_length})" + ) + print("✅ Markdown length matches between web crawl and raw HTML crawl.\n") + + print("All tests passed successfully!") + + # Clean up by removing the saved HTML file + if html_file_path.exists(): + os.remove(html_file_path) + 
print(f"Removed the saved HTML file: {html_file_path}") + +# Run the main function +if __name__ == "__main__": + asyncio.run(main()) +``` + +### **How It Works** + +1. **Step 1: Crawl the Web URL** + - Crawls `https://en.wikipedia.org/wiki/apple`. + - Saves the HTML content to `apple.html`. + - Records the length of the generated markdown. + +2. **Step 2: Crawl from the Local HTML File** + - Uses the `file://` prefix to crawl `apple.html`. + - Ensures the markdown length matches the original web crawl. + +3. **Step 3: Crawl Using Raw HTML Content** + - Reads the HTML from `apple.html`. + - Prefixes it with `raw:` and crawls. + - Verifies the markdown length matches the previous results. + +4. **Cleanup** + - Deletes the `apple.html` file after testing. + +### **Running the Example** + +1. **Save the Script:** + - Save the above code as `test_crawl4ai.py` in your project directory. + +2. **Execute the Script:** + - Run the script using: + ```bash + python test_crawl4ai.py + ``` + +3. **Observe the Output:** + - The script will print logs detailing each step. + - Assertions ensure consistency across different crawling methods. + - Upon success, it confirms that all markdown lengths match. + +--- + +## Conclusion + +With the new prefix-based input handling in **Crawl4AI**, you can effortlessly crawl web URLs, local HTML files, and raw HTML strings using a unified `url` parameter. This enhancement simplifies the API usage and provides greater flexibility for diverse crawling scenarios. 
+ diff --git a/docs/md_v2/basic/quickstart.md b/docs/md_v2/basic/quickstart.md index f4904915..95b8a397 100644 --- a/docs/md_v2/basic/quickstart.md +++ b/docs/md_v2/basic/quickstart.md @@ -8,7 +8,7 @@ First, let's import the necessary modules and create an instance of `AsyncWebCra ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -42,7 +42,7 @@ async def capture_and_save_screenshot(url: str, output_path: str): result = await crawler.arun( url=url, screenshot=True, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) if result.success and result.screenshot: @@ -62,15 +62,15 @@ Crawl4AI supports multiple browser engines. Here's how to use different browsers ```python # Use Firefox async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) # Use WebKit async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) # Use Chromium (default) async with AsyncWebCrawler(verbose=True, headless=True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode=CacheMode.BYPASS) ``` ### User Simulation 🎭 @@ -81,7 +81,7 @@ Simulate real user behavior to avoid detection: async with AsyncWebCrawler(verbose=True, headless=True) as crawler: result = await crawler.arun( url="YOUR-URL-HERE", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, simulate_user=True, # Causes random mouse movements and clicks override_navigator=True # Makes the browser appear
more like a real user ) @@ -99,7 +99,7 @@ async def main(): print(f"First crawl result: {result1.markdown[:100]}...") # Force to crawl again - result2 = await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True) + result2 = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS) print(f"Second crawl result: {result2.markdown[:100]}...") asyncio.run(main()) @@ -189,7 +189,7 @@ extraction_strategy = LLMExtractionStrategy( async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://paulgraham.com/love.html", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, extraction_strategy=extraction_strategy ) ``` @@ -239,7 +239,7 @@ async def crawl_dynamic_content(): js_code=js_next_page if page > 0 else None, wait_for=wait_for if page > 0 else None, js_only=page > 0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, headless=False, ) @@ -254,7 +254,7 @@ Remove overlay elements and fit content appropriately: async with AsyncWebCrawler(headless=False) as crawler: result = await crawler.arun( url="your-url-here", - bypass_cache=True, + cache_mode=CacheMode.BYPASS, word_count_threshold=10, remove_overlay_elements=True, screenshot=True @@ -282,7 +282,7 @@ async with AsyncWebCrawler() as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", word_count_threshold=0, - bypass_cache=True, + cache_mode=CacheMode.BYPASS, verbose=False, ) end = time.time() diff --git a/docs/md_v2/basic/simple-crawling.md b/docs/md_v2/basic/simple-crawling.md index 097d5e61..871fa64c 100644 --- a/docs/md_v2/basic/simple-crawling.md +++ b/docs/md_v2/basic/simple-crawling.md @@ -12,7 +12,9 @@ from crawl4ai import AsyncWebCrawler async def main(): async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="https://example.com") + result = await crawler.arun( + url="https://example.com" + ) print(result.markdown) # Print clean markdown content if __name__ == "__main__": @@ -24,7 +26,7 @@ if 
__name__ == "__main__": The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details): ```python -result = await crawler.arun(url="https://example.com") +result = await crawler.arun(url="https://example.com", fit_markdown=True) # Different content formats print(result.html) # Raw HTML @@ -81,7 +83,7 @@ Here's a more comprehensive example showing common usage patterns: ```python import asyncio -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -97,7 +99,7 @@ async def main(): remove_overlay_elements=True, # Cache control - bypass_cache=False # Use cache if available + cache_mode=CacheMode.ENABLED # Use cache if available ) if result.success: diff --git a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md index 3682425f..a9f00e92 100644 --- a/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md +++ b/docs/md_v2/tutorial/episode_11_2_Extraction_Strategies_LLM.md @@ -52,7 +52,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove extraction_type="schema", instruction="Extract model names and fees for input and output tokens from the page."
), - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -98,7 +98,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove result = await crawler.arun( url="https://example.com/some-article", extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` diff --git a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md index 9f1c00ea..6100ae4c 100644 --- a/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md +++ b/docs/md_v2/tutorial/episode_11_3_Extraction_Strategies_Cosine.md @@ -55,7 +55,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -103,7 +103,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` diff --git a/docs/md_v2/tutorial/tutorial.md b/docs/md_v2/tutorial/tutorial.md index bf355ed0..7bead842 100644 --- a/docs/md_v2/tutorial/tutorial.md +++ b/docs/md_v2/tutorial/tutorial.md @@ -26,7 +26,7 @@ Here's a condensed outline of the **Installation and Setup** video content: - Walk through a simple test script to confirm the setup: ```python import asyncio - from crawl4ai import AsyncWebCrawler + from crawl4ai import AsyncWebCrawler, CacheMode async def main(): async with AsyncWebCrawler(verbose=True) as crawler: @@ -1093,7 +1093,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove extraction_type="schema", instruction="Extract model names and fees for input and output tokens from the page." 
), - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -1139,7 +1139,7 @@ Here’s a comprehensive outline for the **LLM Extraction Strategy** video, cove result = await crawler.arun( url="https://example.com/some-article", extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -1248,7 +1248,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` @@ -1296,7 +1296,7 @@ Here’s a structured outline for the **Cosine Similarity Strategy** video, cove result = await crawler.arun( url=url, extraction_strategy=extraction_strategy, - bypass_cache=True + cache_mode=CacheMode.BYPASS ) print(result.extracted_content) ``` diff --git a/main.py b/main.py index 853cd0b7..6d217410 100644 --- a/main.py +++ b/main.py @@ -10,6 +10,8 @@ from fastapi.exceptions import RequestValidationError from starlette.middleware.base import BaseHTTPMiddleware from starlette.responses import FileResponse from fastapi.responses import RedirectResponse +from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials +from fastapi import Depends, Security from pydantic import BaseModel, HttpUrl, Field from typing import Optional, List, Dict, Any, Union @@ -23,7 +25,8 @@ import logging from enum import Enum from dataclasses import dataclass import json -from crawl4ai import AsyncWebCrawler, CrawlResult +from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode +from crawl4ai.config import MIN_WORD_THRESHOLD from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, @@ -51,18 +54,31 @@ class ExtractionConfig(BaseModel): type: CrawlerType params: Dict[str, Any] = {} +class ChunkingStrategy(BaseModel): + type: str + params: Dict[str, Any] = {} + +class 
ContentFilter(BaseModel): + type: str = "bm25" + params: Dict[str, Any] = {} + class CrawlRequest(BaseModel): urls: Union[HttpUrl, List[HttpUrl]] + word_count_threshold: int = MIN_WORD_THRESHOLD extraction_config: Optional[ExtractionConfig] = None - crawler_params: Dict[str, Any] = {} - priority: int = Field(default=5, ge=1, le=10) - ttl: Optional[int] = 3600 + chunking_strategy: Optional[ChunkingStrategy] = None + content_filter: Optional[ContentFilter] = None js_code: Optional[List[str]] = None wait_for: Optional[str] = None css_selector: Optional[str] = None screenshot: bool = False magic: bool = False extra: Optional[Dict[str, Any]] = {} + session_id: Optional[str] = None + cache_mode: Optional[CacheMode] = CacheMode.ENABLED + priority: int = Field(default=5, ge=1, le=10) + ttl: Optional[int] = 3600 + crawler_params: Dict[str, Any] = {} @dataclass class TaskInfo: @@ -276,12 +292,15 @@ class CrawlerService: if isinstance(request.urls, list): results = await crawler.arun_many( urls=[str(url) for url in request.urls], + word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy=extraction_strategy, js_code=request.js_code, wait_for=request.wait_for, css_selector=request.css_selector, screenshot=request.screenshot, magic=request.magic, + session_id=request.session_id, + cache_mode=request.cache_mode, **request.extra, ) else: @@ -293,6 +312,8 @@ class CrawlerService: css_selector=request.css_selector, screenshot=request.screenshot, magic=request.magic, + session_id=request.session_id, + cache_mode=request.cache_mode, **request.extra, ) @@ -321,7 +342,27 @@ app.add_middleware( # Mount the pages directory as a static directory app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") -app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") + +# API token security +security = HTTPBearer() +CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" + +async def verify_token(credentials: 
HTTPAuthorizationCredentials = Security(security)): + if not CRAWL4AI_API_TOKEN: + return credentials # No token verification if CRAWL4AI_API_TOKEN is not set + if credentials.credentials != CRAWL4AI_API_TOKEN: + raise HTTPException(status_code=401, detail="Invalid token") + return credentials + +# Helper function to conditionally apply security +def secure_endpoint(): + return Depends(verify_token) if CRAWL4AI_API_TOKEN else None + +# Check if site directory exists +if os.path.exists(__location__ + "/site"): + # Mount the site directory as a static directory + app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") + site_templates = Jinja2Templates(directory=__location__ + "/site") templates = Jinja2Templates(directory=__location__ + "/pages") @@ -337,15 +378,18 @@ async def shutdown_event(): @app.get("/") def read_root(): - return RedirectResponse(url="/mkdocs") + if os.path.exists(__location__ + "/site"): + return RedirectResponse(url="/mkdocs") + # Return a json response + return {"message": "Crawl4AI API service is running"} -@app.post("/crawl") +@app.post("/crawl", dependencies=[Depends(verify_token)]) async def crawl(request: CrawlRequest) -> Dict[str, str]: task_id = await crawler_service.submit_task(request) return {"task_id": task_id} -@app.get("/task/{task_id}") +@app.get("/task/{task_id}", dependencies=[Depends(verify_token)]) async def get_task_status(task_id: str): task_info = crawler_service.task_manager.get_task(task_id) if not task_info: @@ -367,6 +411,71 @@ async def get_task_status(task_id: str): return response +@app.post("/crawl_sync", dependencies=[Depends(verify_token)]) +async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: + task_id = await crawler_service.submit_task(request) + + # Wait up to 60 seconds for task completion + for _ in range(60): + task_info = crawler_service.task_manager.get_task(task_id) + if not task_info: + raise HTTPException(status_code=404, detail="Task not found") + + if 
task_info.status == TaskStatus.COMPLETED: + # Return same format as /task/{task_id} endpoint + if isinstance(task_info.result, list): + return {"status": task_info.status, "results": [result.dict() for result in task_info.result]} + return {"status": task_info.status, "result": task_info.result.dict()} + + if task_info.status == TaskStatus.FAILED: + raise HTTPException(status_code=500, detail=task_info.error) + + await asyncio.sleep(1) + + # If we get here, task didn't complete within timeout + raise HTTPException(status_code=408, detail="Task timed out") + +@app.post("/crawl_direct", dependencies=[Depends(verify_token)]) +async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]: + try: + crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params) + extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config) + + try: + if isinstance(request.urls, list): + results = await crawler.arun_many( + urls=[str(url) for url in request.urls], + extraction_strategy=extraction_strategy, + js_code=request.js_code, + wait_for=request.wait_for, + css_selector=request.css_selector, + screenshot=request.screenshot, + magic=request.magic, + cache_mode=request.cache_mode, + session_id=request.session_id, + **request.extra, + ) + return {"results": [result.dict() for result in results]} + else: + result = await crawler.arun( + url=str(request.urls), + extraction_strategy=extraction_strategy, + js_code=request.js_code, + wait_for=request.wait_for, + css_selector=request.css_selector, + screenshot=request.screenshot, + magic=request.magic, + cache_mode=request.cache_mode, + session_id=request.session_id, + **request.extra, + ) + return {"result": result.dict()} + finally: + await crawler_service.crawler_pool.release(crawler) + except Exception as e: + logger.error(f"Error in direct crawl: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + @app.get("/health") async def health_check(): available_slots = await 
crawler_service.resource_monitor.get_available_slots() diff --git a/middlewares.py b/middlewares.py deleted file mode 100644 index e69de29b..00000000 diff --git a/mkdocs.yml b/mkdocs.yml index b09cb9eb..1b26b9df 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,6 +17,7 @@ nav: - 'Browser Configuration': 'basic/browser-config.md' - 'Page Interaction': 'basic/page-interaction.md' - 'Content Selection': 'basic/content-selection.md' + - 'Cache Modes': 'basic/cache-modes.md' - Advanced: - 'Content Processing': 'advanced/content-processing.md' diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index 7bc121a4..00000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,5 +0,0 @@ --r requirements.txt -pytest -pytest-asyncio -selenium -setuptools diff --git a/requirements.txt b/requirements.txt index 9a942958..ed259ac9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,9 @@ playwright>=1.47,<1.48 python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 -playwright_stealth~=1.0 +tf-playwright-stealth~=1.0 +xxhash~=3.4 +rank-bm25~=0.2 +aiofiles~=24.0 +colorama~=0.4 +snowballstemmer~=2.2 \ No newline at end of file diff --git a/setup.py b/setup.py index 90063212..f5f3cf2d 100644 --- a/setup.py +++ b/setup.py @@ -5,34 +5,38 @@ from pathlib import Path import shutil import subprocess import sys +import asyncio # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" +content_folders = ['html_content', 'cleaned_html', 'markdown_content', + 'extracted_content', 'screenshots'] +# Clean up old cache if exists if cache_folder.exists(): shutil.rmtree(cache_folder) +# Create new folder structure crawl4ai_folder.mkdir(exist_ok=True) cache_folder.mkdir(exist_ok=True) +for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) -# 
Read the requirements from requirements.txt +# Read requirements and version __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) with open(os.path.join(__location__, "requirements.txt")) as f: requirements = f.read().splitlines() -# Read version from __init__.py -with open("crawl4ai/_version.py") as f: +with open("crawl4ai/__version__.py") as f: for line in f: if line.startswith("__version__"): version = line.split("=")[1].strip().strip('"') break -# Define the requirements for different environments +# Define requirements default_requirements = requirements -# torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"] -# transformer_requirements = ["transformers", "tokenizers", "onnxruntime"] torch_requirements = ["torch", "nltk", "scikit-learn"] transformer_requirements = ["transformers", "tokenizers"] cosine_similarity_requirements = ["torch", "transformers", "nltk" ] @@ -50,10 +54,24 @@ def install_playwright(): print(f"Unexpected error during Playwright installation: {e}") print("Please run 'python -m playwright install' manually after the installation.") +def run_migration(): + """Initialize database during installation""" + try: + print("Starting database initialization...") + from crawl4ai.async_database import async_db_manager + asyncio.run(async_db_manager.initialize()) + print("Database initialization completed successfully.") + except ImportError: + print("Warning: Database module not found. 
Will initialize on first use.") + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") + class PostInstallCommand(install): def run(self): install.run(self) install_playwright() + # run_migration() setup( name="Crawl4AI", @@ -66,7 +84,7 @@ setup( author_email="unclecode@kidocode.com", license="MIT", packages=find_packages(), - install_requires=default_requirements + ["playwright"], # Add playwright to default requirements + install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles extras_require={ "torch": torch_requirements, "transformer": transformer_requirements, @@ -77,6 +95,7 @@ setup( entry_points={ 'console_scripts': [ 'crawl4ai-download-models=crawl4ai.model_loader:main', + 'crawl4ai-migrate=crawl4ai.migrations:main', # Added migration command ], }, classifiers=[ diff --git a/tests/async/sample_wikipedia.html b/tests/async/sample_wikipedia.html new file mode 100644 index 00000000..a22b3e3f --- /dev/null +++ b/tests/async/sample_wikipedia.html @@ -0,0 +1,2179 @@ + + +Apple - Wikipedia + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Jump to content +
+
+
+ + + + +
+
+ + + + + +
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+
+ +

Apple

+ +
+ + +
+ +
+ + + +
+ +
+
+
+
+
+
+ +
+
+ + + +
+
+
+
+
+ + +
+
+
+
+
+
This is a good article. Click here for more information.
+
Page semi-protected
+
+ +
From Wikipedia, the free encyclopedia
+
+
+ + +
+ + +

+ + + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Apple +
+
'Cripps Pink' apples +
+
Flowers +
Scientific classification Edit this classification +
Kingdom: +Plantae +
Clade: +Tracheophytes +
Clade: +Angiosperms +
Clade: +Eudicots +
Clade: +Rosids +
Order: +Rosales +
Family: +Rosaceae +
Genus: +Malus +
Species: +
M. domestica
+
Binomial name +
Malus domestica
+
Synonyms[1][2] +
+
  • M. communis Desf., 1768
  • +
  • M. pumila Mil.
  • +
  • M. frutescens Medik.
  • +
  • M. paradisiaca (L.) Medikus
  • +
  • M. sylvestris Mil.
  • +
  • Pyrus malus L.
  • +
  • Pyrus malus var. paradisiaca L.
  • +
  • Pyrus dioica Moench
+
+

An apple is a round, edible fruit produced by an apple tree (Malus spp., among them the domestic or orchard apple; Malus domestica). Apple trees are cultivated worldwide and are the most widely grown species in the genus Malus. The tree originated in Central Asia, where its wild ancestor, Malus sieversii, is still found. Apples have been grown for thousands of years in Eurasia and were introduced to North America by European colonists. Apples have religious and mythological significance in many cultures, including Norse, Greek, and European Christian tradition. +

Apples grown from seed tend to be very different from those of their parents, and the resultant fruit frequently lacks desired characteristics. For commercial purposes, including botanical evaluation, apple cultivars are propagated by clonal grafting onto rootstocks. Apple trees grown without rootstocks tend to be larger and much slower to fruit after planting. Rootstocks are used to control the speed of growth and the size of the resulting tree, allowing for easier harvesting. +

There are more than 7,500 cultivars of apples. Different cultivars are bred for various tastes and uses, including cooking, eating raw, and cider or apple juice production. Trees and fruit are prone to fungal, bacterial, and pest problems, which can be controlled by a number of organic and non-organic means. In 2010, the fruit's genome was sequenced as part of research on disease control and selective breeding in apple production. +

+ +

Etymology

+

The word apple, whose Old English ancestor is æppel, is descended from the Proto-Germanic noun *aplaz, descended in turn from Proto-Indo-European *h₂ébōl.[3] As late as the 17th century, the word also functioned as a generic term for all fruit, including nuts. This can be compared to the 14th-century Middle English expression appel of paradis, meaning a banana.[4] +

+

Description

+

The apple is a deciduous tree, generally standing 2 to 4.5 metres (6 to 15 feet) tall in cultivation and up to 15 m (49 ft) in the wild, though more typically 2 to 10 m (6.5 to 33 ft).[5][1] When cultivated, the size, shape and branch density are determined by rootstock selection and trimming method.[5] Apple trees may naturally have a rounded to erect crown with a dense canopy of leaves.[6] The bark of the trunk is dark gray or gray-brown, but young branches are reddish or dark-brown with a smooth texture.[1][7] When young, twigs are covered in very fine downy hairs; they become hairless as they age.[7] +

The buds are egg-shaped and dark red or purple in color; they range in size from 3 to 5 millimeters, but are usually less than 4 mm. The bud scales have very hairy edges. When emerging from the buds, the leaves are convolute, meaning that their edges overlap each other.[1] Leaves can be simple ovals (elliptic), medium or wide in width, somewhat egg-shaped with the wider portion toward their base (ovate), or even with sides that are more parallel to each other instead of curved (oblong) with a narrow pointed end.[7][1] The edges have broadly-angled teeth, but do not have lobes. The top surface of the leaves is glabrescent, almost hairless, while the undersides are densely covered in fine hairs.[1] The leaves are attached alternately by short leaf stems 1-to-3.5 cm (1⁄2-to-1+1⁄2 in) long.[6][1] +

Blossoms are produced in spring simultaneously with the budding of the leaves and are produced on spurs and some long shoots.[5] When the flower buds first begin to open the petals are rose-pink and fade to white or light pink when fully open with each flower 3-to-4-centimeter (1-to-1+1⁄2-inch) in diameter.[1] The five-petaled flowers are grouped in an inflorescence consisting of a cyme with 3–7 flowers.[8] The central flower of the inflorescence is called the "king bloom"; it opens first and can develop a larger fruit.[6] Open apple blossoms are damaged by even brief exposures to temperatures −2 °C (28 °F) or less, although the overwintering wood and buds are hardy down to −40 °C (−40 °F).[8] +

+ +

Fruit

+

The fruit is a pome that matures in late summer or autumn.[1] The true fruits or carpels are the harder interior chambers inside the apple's core. There are usually five carpels inside an apple, but there may be as few as three. Each of the chambers contains one or two seeds.[9] The edible flesh is formed from the receptacle at the base of the flower.[10] +

+ +

The seeds are egg- to pear-shaped and may be colored from light brown or tan to a very dark brown, often with red shades or even purplish-black. They may have a blunt or sharp point.[11] The five sepals remain attached and stand out from the surface of the apple.[1] +

The size of the fruit varies widely between cultivars, but generally has a diameter between 2.5 and 12 cm (1 and 5 in).[7] The shape is quite variable and may be nearly round, elongated, conical, or short and wide.[12] +

The groundcolor of ripe apples is yellow, green, yellow-green or whitish yellow. The overcolor of ripe apples can be orange-red, pink-red, red, purple-red or brown-red. The overcolor amount can be 0–100%.[13] The skin may be wholly or partly russeted, making it rough and brown. The skin is covered in a protective layer of epicuticular wax.[14] The skin may also be marked with scattered dots.[1] The flesh is generally pale yellowish-white, though it can be pink, yellow or green.[13] +

+ +

Chemistry

+

Important volatile compounds in apples that contribute to their scent and flavour include acetaldehyde, ethyl acetate, 1-butanal, ethanol, 2-methylbutanal, 3-methylbutanal, ethyl propionate, ethyl 2-methylpropionate, ethyl butyrate, ethyl 2-methyl butyrate, hexanal, 1-butanol, 3-methylbutyl acetate, 2-methylbutyl acetate, 1-propyl butyrate, ethyl pentanoate, amyl acetate, 2-methyl-1-butanol, trans-2-hexenal, ethyl hexanoate, hexanol.[15][16] +

+

Taxonomy

+

The apple as a species has more than 100 alternative scientific names, or synonyms.[17] In modern times, Malus pumila and Malus domestica are the two main names in use. M. pumila is the older name, but M. domestica has become much more commonly used starting in the 21st century, especially in the western world. Two proposals were made to make M. domestica a conserved name: the earlier proposal was voted down by the Committee for Vascular Plants of the IAPT in 2014, but in April 2017 the Committee decided, with a narrow majority, that the newly popular name should be conserved.[18] The General Committee of the IAPT decided in June 2017 to approve this change, officially conserving M. domestica.[19] Nevertheless, some works published after 2017 still use M. pumila as the correct name, under an alternate taxonomy.[2] +

When first classified by Linnaeus in 1753, the pears, apples, and quinces were combined into one genus that he named Pyrus and he named the apple as Pyrus malus. This was widely accepted; however, the botanist Philip Miller published an alternate classification in The Gardeners Dictionary with the apple species separated from Pyrus in 1754. He did not clearly indicate that by Malus pumila he meant the domesticated apple. Nonetheless, it was used as such by many botanists. When Moritz Balthasar Borkhausen published his scientific description of the apple in 1803 it may have been a new combination of P. malus var. domestica, but this was not directly referenced by Borkhausen.[17] The earliest use of var. domestica for the apple was by Georg Adolf Suckow in 1786.[2] +

+

Genome

+ +

Apples are diploid, with two sets of chromosomes per cell (though triploid cultivars, with three sets, are not uncommon), have 17 chromosomes and an estimated genome size of approximately 650 Mb. Several whole genome sequences have been completed and made available. The first one in 2010 was based on the diploid cultivar 'Golden Delicious'.[20] However, this first whole genome sequence contained several errors,[21] in part owing to the high degree of heterozygosity in diploid apples which, in combination with an ancient genome duplication, complicated the assembly. Recently, double- and trihaploid individuals have been sequenced, yielding whole genome sequences of higher quality.[22][23] +

The first whole genome assembly was estimated to contain around 57,000 genes,[20] though the more recent genome sequences support estimates between 42,000 and 44,700 protein-coding genes.[22][23] The availability of whole genome sequences has provided evidence that the wild ancestor of the cultivated apple most likely is Malus sieversii. Re-sequencing of multiple accessions has supported this, while also suggesting extensive introgression from Malus sylvestris following domestication.[24] +

+

Cultivation

+

History

+
Map of the origins of the cultivated apple. The wild origin is in Kazakhstan; hybridisations and repeated domestications followed, modifying many attributes of the fruit.[24]
+
color photograph of a hand holding a red apple
Wild Malus sieversii apple in Kazakhstan
+

Central Asia is generally considered the center of origin for apples due to the genetic variability in specimens there.[25] The wild ancestor of Malus domestica was Malus sieversii, found growing wild in the mountains of Central Asia in southern Kazakhstan, Kyrgyzstan, Tajikistan, and northwestern China.[5][26] Cultivation of the species, most likely beginning on the forested flanks of the Tian Shan mountains, progressed over a long period of time and permitted secondary introgression of genes from other species into the open-pollinated seeds. Significant exchange with Malus sylvestris, the crabapple, resulted in populations of apples being more related to crabapples than to the more morphologically similar progenitor Malus sieversii. In strains without recent admixture the contribution of the latter predominates.[27][28][29] +

The apple is thought to have been domesticated 4,000–10,000 years ago in the Tian Shan mountains, and then to have travelled along the Silk Road to Europe, with hybridization and introgression of wild crabapples from Siberia (M. baccata), the Caucasus (M. orientalis), and Europe (M. sylvestris). Only the M. sieversii trees growing on the western side of the Tian Shan mountains contributed genetically to the domesticated apple, not the isolated population on the eastern side.[24] +

Chinese soft apples, such as M. asiatica and M. prunifolia, have been cultivated as dessert apples for more than 2,000 years in China. These are thought to be hybrids between M. baccata and M. sieversii in Kazakhstan.[24] +

Among the traits selected for by human growers are size, fruit acidity, color, firmness, and soluble sugar. Unusually for domesticated fruits, the wild M. sieversii origin is only slightly smaller than the modern domesticated apple.[24] +

At the Sammardenchia-Cueis site near Udine in Northeastern Italy, seeds from some form of apples have been found in material carbon dated to between 6570 and 5684 BCE.[30] Genetic analysis has not yet been successfully used to determine whether such ancient apples were wild Malus sylvestris or Malus domestica containing Malus sieversii ancestry. It is hard to distinguish in the archeological record between foraged wild apples and apple plantations.[31] +

There is indirect evidence of apple cultivation in the third millennium BCE in the Middle East.[31] There is direct evidence, apple cores, dated to the 10th century BCE from a Judean site between the Sinai and Negev. +[32] There was substantial apple production in European classical antiquity, and grafting was certainly known then.[31] Grafting is an essential part of modern domesticated apple production, to be able to propagate the best cultivars; it is unclear when apple tree grafting was invented.[31] +

+ +

The Roman writer Pliny the Elder describes a method of storage for apples from his time in the 1st century. He says they should be placed in a room with good air circulation from a north facing window on a bed of straw, chaff, or mats with windfalls kept separately.[33] Though methods like this will extend the availability of reasonably fresh apples, without refrigeration their lifespan is limited. Even sturdy winter apple varieties will only keep well until December in cool climates.[34] For longer storage medieval Europeans strung up cored and peeled apples to dry, either whole or sliced into rings.[35] +

Of the many Old World plants that the Spanish introduced to Chiloé Archipelago in the 16th century, apple trees became particularly well adapted.[36] Apples were introduced to North America by colonists in the 17th century,[5] and the first named apple cultivar was introduced in Boston by Reverend William Blaxton in 1640.[37] The only apples native to North America are crab apples.[38] +

Apple cultivars brought as seed from Europe were spread along Native American trade routes, as well as being cultivated on colonial farms. An 1845 United States apple nursery catalogue sold 350 of the "best" cultivars, showing the proliferation of new North American cultivars by the early 19th century.[38] In the 20th century, irrigation projects in Eastern Washington began and allowed the development of the multibillion-dollar fruit industry, of which the apple is the leading product.[5] +

Until the 20th century, farmers stored apples in frostproof cellars during the winter for their own use or for sale. Improved transportation of fresh apples by train and road replaced the necessity for storage.[39][40] Controlled atmosphere facilities are used to keep apples fresh year-round. Controlled atmosphere facilities use high humidity, low oxygen, and controlled carbon dioxide levels to maintain fruit freshness. They were first researched at Cambridge University in the 1920s and first used in the United States in the 1950s.[41] +

+

Breeding

+ +
An apple tree in Germany
+

Many apples grow readily from seeds. However, apples must be propagated asexually to obtain cuttings with the characteristics of the parent. This is because seedling apples are "extreme heterozygotes". Rather than resembling their parents, seedlings are all different from each other and from their parents.[42] Triploid cultivars have an additional reproductive barrier in that three sets of chromosomes cannot be divided evenly during meiosis, yielding unequal segregation of the chromosomes (aneuploids). Even in the case when a triploid plant can produce a seed (apples are an example), it occurs infrequently, and seedlings rarely survive.[43] +

Because apples are not true breeders when planted as seeds, propagation usually involves grafting of cuttings. The rootstock used for the bottom of the graft can be selected to produce trees of a large variety of sizes, as well as changing the winter hardiness, insect and disease resistance, and soil preference of the resulting tree. Dwarf rootstocks can be used to produce very small trees (less than 3.0 m or 10 ft high at maturity), which bear fruit many years earlier in their life cycle than full size trees, and are easier to harvest.[44] +

Dwarf rootstocks for apple trees can be traced as far back as 300 BCE, to the area of Persia and Asia Minor. Alexander the Great sent samples of dwarf apple trees to Aristotle's Lyceum. Dwarf rootstocks became common by the 15th century and later went through several cycles of popularity and decline throughout the world.[45] The majority of the rootstocks used to control size in apples were developed in England in the early 1900s. The East Malling Research Station conducted extensive research into rootstocks, and their rootstocks are given an "M" prefix to designate their origin. Rootstocks marked with an "MM" prefix are Malling-series cultivars later crossed with trees of 'Northern Spy' in Merton, England.[46] +

Most new apple cultivars originate as seedlings, which either arise by chance or are bred by deliberately crossing cultivars with promising characteristics.[47] The words "seedling", "pippin", and "kernel" in the name of an apple cultivar suggest that it originated as a seedling. Apples can also form bud sports (mutations on a single branch). Some bud sports turn out to be improved strains of the parent cultivar. Some differ sufficiently from the parent tree to be considered new cultivars.[48] +

Apples have been acclimatized in Ecuador at very high altitudes, where they can often, with the needed factors, provide crops twice per year because of constant temperate conditions year-round.[49] +

+

Pollination

+ +
Apple blossom from an old Ayrshire cultivar
+
An orchard mason bee on an apple bloom in British Columbia, Canada
+

Apples are self-incompatible; they must cross-pollinate to develop fruit. During the flowering each season, apple growers often utilize pollinators to carry pollen. Honey bees are most commonly used. Orchard mason bees are also used as supplemental pollinators in commercial orchards. Bumblebee queens are sometimes present in orchards, but not usually in sufficient number to be significant pollinators.[48][50] +

Cultivars are sometimes classified by the day of peak bloom in the average 30-day blossom period, with pollinizers selected from cultivars within a 6-day overlap period. There are four to seven pollination groups in apples, depending on climate: +

+
  • Group A – Early flowering, 1 to 3 May in England ('Gravenstein', 'Red Astrachan')
  • +
  • Group B – 4 to 7 May ('Idared', 'McIntosh')
  • +
  • Group C – Mid-season flowering, 8 to 11 May ('Granny Smith', 'Cox's Orange Pippin')
  • +
  • Group D – Mid/late season flowering, 12 to 15 May ('Golden Delicious', 'Calville blanc d'hiver')
  • +
  • Group E – Late flowering, 16 to 18 May ('Braeburn', 'Reinette d'Orléans')
  • +
  • Group F – 19 to 23 May ('Suntan')
  • +
  • Group H – 24 to 28 May ('Court-Pendu Gris' – also called Court-Pendu plat)
+

One cultivar can be pollinated by a compatible cultivar from the same group or close (A with A, or A with B, but not A with C or D).[51] +

+

Maturation and harvest

+ +
L. K. Relander, the former President of Finland, with his family picking apples in the 1930s
+

Cultivars vary in their yield and the ultimate size of the tree, even when grown on the same rootstock. Some cultivars, if left unpruned, grow very large—letting them bear more fruit, but making harvesting more difficult. Depending on tree density (number of trees planted per unit surface area), mature trees typically bear 40–200 kg (90–440 lb) of apples each year, though productivity can be close to zero in poor years. Apples are harvested using three-point ladders that are designed to fit amongst the branches. Trees grafted on dwarfing rootstocks bear about 10–80 kg (20–180 lb) of fruit per year.[48] +

Some farms with apple orchards open them to the public so consumers can pick their own apples.[52] +

Crops ripen at different times of the year according to the cultivar. Cultivars that yield their crop in the summer include 'Sweet Bough' and 'Duchess'; fall producers include 'Blenheim'; winter producers include 'King', 'Swayzie', and 'Tolman Sweet'.[38] +

+

Storage

+
Different apple cultivars in a wholesale food market
+

Commercially, apples can be stored for months in controlled atmosphere chambers. Apples are commonly stored in chambers with lowered concentrations of oxygen to reduce respiration and slow softening and other changes if the fruit is already fully ripe. The gas ethylene is used by plants as a hormone which promotes ripening, decreasing the time an apple can be stored. For storage longer than about six months the apples are picked earlier, before full ripeness, when ethylene production by the fruit is low. However, in many varieties this increases their sensitivity to carbon dioxide, which also must be controlled.[53] +

For home storage, most cultivars of apple can be stored for three weeks in a pantry and four to six weeks from the date of purchase in a refrigerator that maintains 4 to 0 °C (39 to 32 °F).[54][55] Some varieties of apples (e.g. 'Granny Smith' and 'Fuji') have more than three times the storage life of others.[56] +

Non-organic apples may be sprayed with a substance 1-methylcyclopropene blocking the apples' ethylene receptors, temporarily preventing them from ripening.[57] +

+

Pests and diseases

+ +
Codling moth larva tunnelling inside an apple
+

Apple trees are susceptible to fungal and bacterial diseases, and to damage by insect pests. Many commercial orchards pursue a program of chemical sprays to maintain high fruit quality, tree health, and high yields. Organic orchards, by contrast, prohibit the use of synthetic pesticides, though some older pesticides are allowed. Organic methods include, for instance, introducing a pest's natural predator to reduce the population of that pest. +

A wide range of pests and diseases can affect the plant. Three of the more common diseases or pests are mildew, aphids, and apple scab. +

+
  • Mildew is characterized by light grey powdery patches appearing on the leaves, shoots and flowers, normally in spring. The flowers turn a creamy yellow color and do not develop correctly. This can be treated similarly to Botrytis—eliminating the conditions that caused the disease and burning the infected plants are among recommended actions.[58]
  • +
  • Aphids are small insects with sucking mouthparts. Five species of aphids commonly attack apples: apple grain aphid, rosy apple aphid, apple aphid, spirea aphid, and the woolly apple aphid. The aphid species can be identified by color, time of year, and by differences in the cornicles (small paired projections from their rear).[59] Aphids feed on foliage using needle-like mouth parts to suck out plant juices. When present in high numbers, certain species reduce tree growth and vigor.[60]
  • +
  • Apple scab: Apple scab causes leaves to develop olive-brown spots with a velvety texture that later turn brown and become cork-like in texture. The disease also affects the fruit, which also develops similar brown spots with velvety or cork-like textures. Apple scab is spread through fungus growing in old apple leaves on the ground and spreads during warm spring weather to infect the new year's growth.[61]
+

Among the most serious disease problems is a bacterial disease called fireblight, and three fungal diseases: Gymnosporangium rust, black spot,[62] and bitter rot.[63] Codling moths, and the apple maggots of fruit flies, cause serious damage to apple fruits, making them unsaleable. Young apple trees are also prone to mammal pests like mice and deer, which feed on the soft bark of the trees, especially in winter.[61] The larvae of the apple clearwing moth (red-belted clearwing) burrow through the bark and into the phloem of apple trees, potentially causing significant damage.[64] +

+

Cultivars

+ +
An assortment of apple cultivars
+

There are more than 7,500 known cultivars (cultivated varieties) of apples.[65] Cultivars vary in their yield and the ultimate size of the tree, even when grown on the same rootstock.[66] Different cultivars are available for temperate and subtropical climates. The UK's National Fruit Collection, which is the responsibility of the Department of Environment, Food, and Rural Affairs, includes a collection of over 2,000 cultivars of apple tree in Kent.[67] The University of Reading, which is responsible for developing the UK national collection database, provides access to search the national collection. The University of Reading's work is part of the European Cooperative Programme for Plant Genetic Resources of which there are 38 countries participating in the Malus/Pyrus work group.[68] +

The UK's national fruit collection database contains much information on the characteristics and origin of many apples, including alternative names for what is essentially the same "genetic" apple cultivar. Most of these cultivars are bred for eating fresh (dessert apples), though some are cultivated specifically for cooking (cooking apples) or producing cider. Cider apples are typically too tart and astringent to eat fresh, but they give the beverage a rich flavor that dessert apples cannot.[69] +

In the United States there are many apple breeding programs associated with universities. Cornell University has had a program operating since 1880 in Geneva, New York. Among their recent well-known apples is the 'SnapDragon' cultivar released in 2013. In the west, Washington State University started a program to support their apple industry in 1994 and released the 'Cosmic Crisp' cultivar in 2017. The third most grown apple cultivar in the United States is the 'Honeycrisp', released by the University of Minnesota program in 1991.[70] Unusually for a popular cultivar, the 'Honeycrisp' is not directly related to another popular apple cultivar but instead to two unsuccessful cultivars.[71] In Europe there are also many breeding programs such as the Julius Kühn-Institut, the German federal research center for cultivated plants.[72] +

Commercially popular apple cultivars are soft but crisp. Other desirable qualities in modern commercial apple breeding are a colorful skin, absence of russeting, ease of shipping, lengthy storage ability, high yields, disease resistance, common apple shape, and developed flavor.[66] Modern apples are generally sweeter than older cultivars, as popular tastes in apples have varied over time. Most North Americans and Europeans favor sweet, subacid apples, but tart apples have a strong minority following.[73] Extremely sweet apples with barely any acid flavor are popular in Asia,[73] especially the Indian subcontinent.[69] +

+
Less common apple cultivars from an orchard in Italy
+

Old cultivars are often oddly shaped, russeted, and grow in a variety of textures and colors. Some find them to have better flavor than modern cultivars, but they may have other problems that make them commercially unviable—low yield, disease susceptibility, poor tolerance for storage or transport, or just being the "wrong" size.[74] A few old cultivars are still produced on a large scale, but many have been preserved by home gardeners and farmers that sell directly to local markets. Many unusual and locally important cultivars with their own unique taste and appearance exist; apple conservation campaigns have sprung up around the world to preserve such local cultivars from extinction. In the United Kingdom, old cultivars such as 'Cox's Orange Pippin' and 'Egremont Russet' are still commercially important even though by modern standards they are low yielding and susceptible to disease.[5] +

+

Production

+ + + + + + + + + + + + + + + + + + + + + + + +
Apple production
+

2022, millions of tonnes
+

+
 China47.6 +
 United States4.8 +
 Turkey4.4 +
 Poland4.3 +
 India2.6 +
World95.8 +
Source: FAOSTAT of the United Nations[75] +
+

World production of apples in 2022 was 96 million tonnes, with China producing 50% of the total (table).[75] Secondary producers were the United States, Turkey, and Poland.[75] +

+

Toxicity

+

Amygdalin

+

Apple seeds contain small amounts of amygdalin, a sugar and cyanide compound known as a cyanogenic glycoside. Ingesting small amounts of apple seeds causes no ill effects, but consumption of extremely large doses can cause adverse reactions. It may take several hours before the poison takes effect, as cyanogenic glycosides must be hydrolyzed before the cyanide ion is released.[76] The U.S. National Library of Medicine's Hazardous Substances Data Bank records no cases of amygdalin poisoning from consuming apple seeds.[77] +

+

Allergy

+

One form of apple allergy, often found in northern Europe, is called birch-apple syndrome and is found in people who are also allergic to birch pollen.[78] Allergic reactions are triggered by a protein in apples that is similar to birch pollen, and people affected by this protein can also develop allergies to other fruits, nuts, and vegetables. Reactions, which entail oral allergy syndrome (OAS), generally involve itching and inflammation of the mouth and throat,[78] but in rare cases can also include life-threatening anaphylaxis.[79] This reaction only occurs when raw fruit is consumed—the allergen is neutralized in the cooking process. The variety of apple, maturity and storage conditions can change the amount of allergen present in individual fruits. Long storage times can increase the amount of proteins that cause birch-apple syndrome.[78] +

In other areas, such as the Mediterranean, some individuals have adverse reactions to apples because of their similarity to peaches.[78] This form of apple allergy also includes OAS, but often has more severe symptoms, such as vomiting, abdominal pain and urticaria, and can be life-threatening. Individuals with this form of allergy can also develop reactions to other fruits and nuts. Cooking does not break down the protein causing this particular reaction, so affected individuals cannot eat raw or cooked apples. Freshly harvested, over-ripe fruits tend to have the highest levels of the protein that causes this reaction.[78] +

Breeding efforts have yet to produce a hypoallergenic fruit suitable for either of the two forms of apple allergy.[78] +

+

Uses

+ +

Nutrition

+
+ +
Apples, with skin (edible parts)
Nutritional value per 100 g (3.5 oz)
Energy218 kJ (52 kcal)
13.81 g
Sugars10.39 g
Dietary fiber2.4 g
+
0.17 g
+
0.26 g
+ + + + +
Vitamins and minerals
+
VitaminsQuantity
%DV
Vitamin A equiv.
0%
3 μg
0%
27 μg
29 μg
Thiamine (B1)
1%
0.017 mg
Riboflavin (B2)
2%
0.026 mg
Niacin (B3)
1%
0.091 mg
Pantothenic acid (B5)
1%
0.061 mg
Vitamin B6
2%
0.041 mg
Folate (B9)
1%
3 μg
Vitamin C
5%
4.6 mg
Vitamin E
1%
0.18 mg
Vitamin K
2%
2.2 μg
+
MineralsQuantity
%DV
Calcium
0%
6 mg
Iron
1%
0.12 mg
Magnesium
1%
5 mg
Manganese
2%
0.035 mg
Phosphorus
1%
11 mg
Potassium
4%
107 mg
Sodium
0%
1 mg
Zinc
0%
0.04 mg
+
Other constituentsQuantity
Water85.56 g
+

Percentages estimated using US recommendations for adults,[80] except for potassium, which is estimated based on expert recommendation from the National Academies.[81]
+
+

A raw apple is 86% water and 14% carbohydrates, with negligible content of fat and protein (table). A reference serving of a raw apple with skin weighing 100 g (3.5 oz) provides 52 calories and a moderate content of dietary fiber (table). Otherwise, there is low content of micronutrients, with the Daily Values of all falling below 10% (table). +

+

Culinary

+ +
Machine for paring, coring, and slicing apples, from Henry B. Scammell's 1897 handbook Cyclopedia of Valuable Receipts
+

Apple varieties can be grouped as cooking apples, eating apples, and cider apples, the last so astringent as to be "almost inedible".[82] Apples are consumed as juice, raw in salads, baked in pies, cooked into sauces and apple butter, or baked whole.[83] They are sometimes used as an ingredient in savory foods, such as sausage and stuffing.[84] +

Several techniques are used to preserve apples and apple products. Traditional methods include drying and making apple butter.[82] Juice and cider are produced commercially; cider is a significant industry in regions such as the West of England and Normandy.[82] +

A toffee apple (UK) or caramel apple (US) is a confection made by coating an apple in hot toffee or caramel candy respectively and allowing it to cool.[85][8] Apples and honey are a ritual food pairing eaten during the Jewish New Year of Rosh Hashanah.[86] +

Apples are an important ingredient in many desserts, such as pies, crumbles, and cakes. When cooked, some apple cultivars easily form a puree known as apple sauce, which can be cooked down to form a preserve, apple butter. They are often baked or stewed, and are cooked in some meat dishes.[82] +

+ +

Apples are milled or pressed to produce apple juice, which may be drunk unfiltered (called apple cider in North America), or filtered. Filtered juice is often concentrated and frozen, then reconstituted later and consumed. Apple juice can be fermented to make cider (called hard cider in North America), ciderkin, and vinegar.[8] Through distillation, various alcoholic beverages can be produced, such as applejack, Calvados, and apple brandy.[8][87] +

+

Organic production

+

Organic apples are commonly produced in the United States.[88] Due to infestations by key insects and diseases, organic production is difficult in Europe.[89] The use of pesticides containing chemicals, such as sulfur, copper, microorganisms, viruses, clay powders, or plant extracts (pyrethrum, neem) has been approved by the EU Organic Standing Committee to improve organic yield and quality.[89] A light coating of kaolin, which forms a physical barrier to some pests, also may help prevent apple sun scalding.[48] +

+

Non-browning apples

+

Apple skins and seeds contain polyphenols.[90] These are oxidised by the enzyme polyphenol oxidase, which causes browning in sliced or bruised apples, by catalyzing the oxidation of phenolic compounds to o-quinones, a browning factor.[91] Browning reduces apple taste, color, and food value. Arctic apples, a non-browning group of apples introduced to the United States market in 2019, have been genetically modified to silence the expression of polyphenol oxidase, thereby delaying a browning effect and improving apple eating quality.[92][93] The US Food and Drug Administration in 2015, and Canadian Food Inspection Agency in 2017, determined that Arctic apples are as safe and nutritious as conventional apples.[94][95] +

+

Other products

+

Apple seed oil, obtained by pressing apple seeds, is used in manufacturing cosmetics.[96] +

+

In culture

+ +

Germanic paganism

+
Illustration of girl in a red dress, holding 3 candles in one hand and a basket of apples in the other
"Brita as Iduna" (1901) by Carl Larsson
+

In Norse mythology, the goddess Iðunn is portrayed in the Prose Edda (written in the 13th century by Snorri Sturluson) as providing apples to the gods that give them eternal youthfulness. The English scholar H. R. Ellis Davidson links apples to religious practices in Germanic paganism, from which Norse paganism developed. She points out that buckets of apples were found in the Oseberg ship burial site in Norway, that fruit and nuts (Iðunn having been described as being transformed into a nut in Skáldskaparmál) have been found in the early graves of the Germanic peoples in England and elsewhere on the continent of Europe, which may have had a symbolic meaning, and that nuts are still a recognized symbol of fertility in southwest England.[97] +

Davidson notes a connection between apples and the Vanir, a tribe of gods associated with fertility in Norse mythology, citing an instance of eleven "golden apples" being given to woo the beautiful Gerðr by Skírnir, who was acting as messenger for the major Vanir god Freyr in stanzas 19 and 20 of Skírnismál. Davidson also notes a further connection between fertility and apples in Norse mythology in chapter 2 of the Völsunga saga: when the major goddess Frigg sends King Rerir an apple after he prays to Odin for a child, Frigg's messenger (in the guise of a crow) drops the apple in his lap as he sits atop a mound.[97] Rerir's wife's consumption of the apple results in a six-year pregnancy and the birth (by Caesarean section) of their son—the hero Völsung.[98] +

Further, Davidson points out the "strange" phrase "Apples of Hel" used in an 11th-century poem by the skald Thorbiorn Brúnarson. She states this may imply that the apple was thought of by Brúnarson as the food of the dead. Further, Davidson notes that the potentially Germanic goddess Nehalennia is sometimes depicted with apples and that parallels exist in early Irish stories. Davidson asserts that while cultivation of the apple in Northern Europe extends back to at least the time of the Roman Empire and came to Europe from the Near East, the native varieties of apple trees growing in Northern Europe are small and bitter. Davidson concludes that in the figure of Iðunn "we must have a dim reflection of an old symbol: that of the guardian goddess of the life-giving fruit of the other world."[97] +

+

Greek mythology

+
Heracles with the apple of Hesperides
+

Apples appear in many religious traditions, including Greek and Roman mythology, where they have an ambiguous symbolism of discord, fertility, or courtship.[99] In Greek mythology, the Greek hero Heracles, as a part of his Twelve Labours, was required to travel to the Garden of the Hesperides and pick the golden apples off the Tree of Life growing at its center.[100] +

The Greek goddess of discord, Eris, became disgruntled after she was excluded from the wedding of Peleus and Thetis.[101] In retaliation, she tossed a golden apple inscribed Καλλίστη (Kallistē, "For the most beautiful one"), into the wedding party. Three goddesses claimed the apple: Hera, Athena, and Aphrodite. Paris of Troy was appointed to select the recipient. After both Hera and Athena had tried to bribe him, Aphrodite tempted him with the most beautiful woman in the world, Helen of Sparta. He awarded the apple to Aphrodite, thus indirectly causing the Trojan War.[102][103] +

The apple was thus considered, in ancient Greece, sacred to Aphrodite. To throw an apple at someone was to symbolically declare one's love; and similarly, to catch it was to symbolically show one's acceptance of that love. An epigram claiming authorship by Plato states:[104] +

+

I throw the apple at you, and if you are willing to love me, take it and share your girlhood with me; but if your thoughts are what I pray they are not, even then take it, and consider how short-lived is beauty.

— Plato, Epigram VII
+

Atalanta, also of Greek mythology, raced all her suitors in an attempt to avoid marriage. She outran all but Hippomenes (also known as Melanion, a name possibly derived from melon, the Greek word for both "apple" and fruit in general),[100] who defeated her by cunning, not speed. Hippomenes knew that he could not win in a fair race, so he used three golden apples (gifts of Aphrodite, the goddess of love) to distract Atalanta. It took all three apples and all of his speed, but Hippomenes was finally successful, winning the race and Atalanta's hand.[105][106] +

+

Celtic mythology

+

In Celtic mythology, the otherworld has many names, including Emain Ablach, "Emain of the Apple-trees". A version of this is Avalon in Arthurian legend, or in Welsh Ynys Afallon, "Island of Apples".[107] +

+

China

+
Píngānguǒ ("Peace apples") on sale in Beijing for Christmas Eve (2017)
+

In China, apples symbolise peace, since the sounds of the first element ("píng") in the words "apple" (苹果, Píngguǒ) and "peace" (平安, Píng'ān) are homophonous in Mandarin and Cantonese.[3][108] When these two words are combined, the word Píngānguǒ (平安果, "Peace apples") is formed. This association developed further as the name for Christmas Eve in Mandarin is Píngānyè (平安夜, "Peaceful/Quiet Evening"), which made the gifting of apples at this season to friends and associates popular, as a way to wish them peace and safety.[108] +

+

Christian art

+
Adam and Eve by Albrecht Dürer (1507), showcasing the apple as a symbol of sin
+

Though the forbidden fruit of Eden in the Book of Genesis is not identified, popular Christian tradition has held that it was an apple that Eve coaxed Adam to share with her.[109] The origin of the popular identification with a fruit unknown in the Middle East in biblical times is found in wordplay with the Latin words mālum (an apple) and mălum (an evil), each of which is normally written malum.[110] The tree of the forbidden fruit is called "the tree of the knowledge of good and evil" in Genesis 2:17,[111] and the Latin for "good and evil" is bonum et malum.[112] +

Renaissance painters may also have been influenced by the story of the golden apples in the Garden of Hesperides. As a result, in the story of Adam and Eve, the apple became a symbol for knowledge, immortality, temptation, the fall of man into sin, and sin itself. The larynx in the human throat has been called the "Adam's apple" because of a notion that it was caused by the forbidden fruit remaining in the throat of Adam. The apple as symbol of sexual seduction has been used to imply human sexuality, possibly in an ironic vein.[109] +

+

Proverb

+

The proverb, "An apple a day keeps the doctor away", addressing the supposed health benefits of the fruit, has been traced to 19th-century Wales, where the original phrase was "Eat an apple on going to bed, and you'll keep the doctor from earning his bread".[113] In the 19th century and early 20th, the phrase evolved to "an apple a day, no doctor to pay" and "an apple a day sends the doctor away"; the phrasing now commonly used was first recorded in 1922.[114] +

+

See also

+ +

References

+
+
    +
  1. ^ Jump up to: a b c d e f g h i j k Dickson, Elizabeth E. (28 May 2021). "Malus domestica". Flora of North America. Archived from the original on 28 July 2024. Retrieved 27 July 2024. +
  2. +
  3. ^ Jump up to: a b c "Malus domestica (Suckow) Borkh". Plants of the World Online. Royal Botanic Gardens, Kew. Retrieved 31 July 2024. +
  4. +
  5. ^ Jump up to: a b Lim, Lisa (6 July 2021). "Where the word 'apple' came from and why the forbidden fruit was unlucky to be linked with the fall of man". Language Matters. South China Morning Post. Hong Kong, China: Alibaba Group. Archived from the original on 28 June 2023. Retrieved 28 June 2023. +
  6. +
  7. ^ "Origin and meaning of "apple" by Online Etymology Dictionary". Online Etymology Dictionary. Archived from the original on 21 December 2019. Retrieved 22 November 2019. +
  8. +
  9. ^ Jump up to: a b c d e f g Rieger, Mark. "Apple - Malus domestica". HORT 3020: Intro Fruit Crops. University of Georgia. Archived from the original on 21 January 2008. Retrieved 22 January 2008. +
  10. +
  11. ^ Jump up to: a b c "Apples - Malus domestica". North Carolina Extension Gardener Plant Toolbox. North Carolina State University. Archived from the original on 31 May 2024. Retrieved 31 July 2024. +
  12. +
  13. ^ Jump up to: a b c d Heil, Kenneth D.; O'Kane, Jr., Steve L.; Reeves, Linda Mary; Clifford, Arnold (2013). Flora of the Four Corners Region: Vascular Plants of the San Juan River Drainage, Arizona, Colorado, New Mexico, and Utah (First ed.). St. Louis, Missouri: Missouri Botanical Garden. p. 909. ISBN 978-1-930723-84-9. ISSN 0161-1542. LCCN 2012949654. OCLC 859541992. Retrieved 27 July 2024. +
  14. +
  15. ^ Jump up to: a b c d e Lim, Tong Kwee (2012). "Malus x domestica". Edible Medicinal and Non-Medicinal Plants. Vol. 4, Fruit (First ed.). Dordrecht, the Netherlands: Springer. pp. 414–415. doi:10.1007/978-94-007-4053-2_49. ISBN 978-94-007-4053-2. OCLC 795503871. +
  16. +
  17. ^ Juniper, Barrie E.; Mabberley, David J. (2006). The Story of the Apple (First ed.). Portland, Oregon: Timber Press. p. 27. ISBN 978-0-88192-784-9. LCCN 2006011869. OCLC 67383484. Retrieved 1 August 2024. +
  18. +
  19. ^ "Fruit glossary". Royal Horticultural Society. Archived from the original on 7 August 2024. Retrieved 7 August 2024. +
  20. +
  21. ^ Burford, Tom (2013). Apples of North America : 192 Exceptional Varieties for Gardeners, Growers and Cooks (First ed.). Portland, Oregon: Timber Press. pp. 22, 50, 55, 122, 123, 137, 141, 147, 159, 245, 246. ISBN 978-1-60469-249-5. LCCN 2012045130. OCLC 819860825. +
  22. +
  23. ^ "Shape". Western Agricultural Research Center. Montana State University. Archived from the original on 23 April 2024. Retrieved 30 July 2024. +
  24. +
  25. ^ Jump up to: a b Janick, Jules; Cummins, James N.; Brown, Susan K.; Hemmat, Minou (1996). "Chapter 1: Apples" (PDF). Fruit Breeding. Vol. I: Tree and Tropical Fruits. New York: John Wiley & Sons. pp. 9, 48. ISBN 978-0-471-31014-3. LCCN 95016407. OCLC 1302621533. Archived (PDF) from the original on 19 July 2013. Retrieved 30 August 2024. +
  26. +
  27. ^ "Natural Waxes on Fruits". Postharvest.tfrec.wsu.edu. 29 October 2010. Archived from the original on 24 May 2013. Retrieved 14 June 2013. +
  28. +
  29. ^ Flath, R. A.; Black, D. R.; Forrey, R. R.; McDonald, G. M.; Mon, T. R.; Teranishi, R. (1 August 1969). "Volatiles in Gravenstein Apple Essence Identified by GC-Mass Spectrometry". Journal of Chromatographic Science. 7 (8): 508. doi:10.1093/CHROMSCI/7.8.508. +
  30. +
  31. ^ Flath, Robert A.; Black, Dale Robert.; Guadagni, Dante G.; McFadden, William H.; Schultz, Thomas H. (January 1967). "Identification and organoleptic evaluation of compounds in Delicious apple essence". Journal of Agricultural and Food Chemistry. 15 (1): 29. doi:10.1021/jf60149a032. +
  32. +
  33. ^ Jump up to: a b Qian, Guan-Ze; Liu, Lian-Fen; Tang, Geng-Guo (April 2010). "(1933) Proposal to conserve the name Malus domestica against M. pumila, M. communis, M. frutescens, and Pyrus dioica ( Rosaceae )". Taxon. 59 (2): 650–652. doi:10.1002/tax.592038. +
  34. +
  35. ^ Applequist, Wendy L. (2017). "Report of the Nomenclature Committee for Vascular Plants: 69" (PDF). Taxon. 66 (2): 500–513. doi:10.12705/662.17. Archived (PDF) from the original on 7 May 2024. +
  36. +
  37. ^ Wilson, Karen L. (June 2017). "Report of the General Committee: 18". Taxon. 66 (3): 742. doi:10.12705/663.15. +
  38. +
  39. ^ Jump up to: a b Velasco, Riccardo; Zharkikh, Andrey; Affourtit, Jason; Dhingra, Amit; Cestaro, Alessandro; et al. (2010). "The genome of the domesticated apple (Malus × domestica Borkh.)". Nature Genetics. 42 (10): 833–839. doi:10.1038/ng.654. PMID 20802477. S2CID 14854514. +
  40. +
  41. ^ Di Pierro, Erica A.; Gianfranceschi, Luca; Di Guardo, Mario; Koehorst-Van Putten, Herma J.J.; Kruisselbrink, Johannes W.; et al. (2016). "A high-density, multi-parental SNP genetic map on apple validates a new mapping approach for outcrossing species". Horticulture Research. 3 (1): 16057. Bibcode:2016HorR....316057D. doi:10.1038/hortres.2016.57. PMC 5120355. PMID 27917289. +
  42. +
  43. ^ Jump up to: a b Daccord, Nicolas; Celton, Jean-Marc; Linsmith, Gareth; et al. (2017). "High-quality de novo assembly of the apple genome and methylome dynamics of early fruit development". Nature Genetics. 49 (7). Nature Communications: 1099–1106. doi:10.1038/ng.3886. hdl:10449/42064. PMID 28581499. S2CID 24690391. +
  44. +
  45. ^ Jump up to: a b Zhang, Liyi; Hu, Jiang; Han, Xiaolei; Li, Jingjing; Gao, Yuan; et al. (2019). "A high-quality apple genome assembly reveals the association of a retrotransposon and red fruit colour". Nature Communications. 10 (1). Nature Genetics: 1494. Bibcode:2019NatCo..10.1494Z. doi:10.1038/s41467-019-09518-x. PMC 6445120. PMID 30940818. +
  46. +
  47. ^ Jump up to: a b c d e Duan, Naibin; Bai, Yang; Sun, Honghe; Wang, Nan; Ma, Yumin; et al. (2017). "Genome re-sequencing reveals the history of apple and supports a two-stage model for fruit enlargement". Nature Communications. 8 (1): 249. Bibcode:2017NatCo...8..249D. doi:10.1038/s41467-017-00336-7. PMC 5557836. PMID 28811498. +
  48. +
  49. ^ Richards, Christopher M.; Volk, Gayle M.; Reilley, Ann A.; Henk, Adam D.; Lockwood, Dale R.; et al. (2009). "Genetic diversity and population structure in Malus sieversii, a wild progenitor species of domesticated apple". Tree Genetics & Genomes. 5 (2): 339–347. doi:10.1007/s11295-008-0190-9. S2CID 19847067. +
  50. +
  51. ^ Lauri, Pierre-éric; Maguylo, Karen; Trottier, Catherine (March 2006). "Architecture and size relations: an essay on the apple (Malus × domestica, Rosaceae) tree". American Journal of Botany. 93 (3): 357–368. doi:10.3732/ajb.93.3.357. PMID 21646196. Archived from the original on 20 April 2019. Retrieved 27 July 2024. +
  52. +
  53. ^ Cornille, Amandine; Gladieux, Pierre; Smulders, Marinus J. M.; Roldán-Ruiz, Isabel; Laurens, François; et al. (2012). Mauricio, Rodney (ed.). "New Insight into the History of Domesticated Apple: Secondary Contribution of the European Wild Apple to the Genome of Cultivated Varieties". PLOS Genetics. 8 (5): e1002703. doi:10.1371/journal.pgen.1002703. PMC 3349737. PMID 22589740. +
  54. +
  55. ^ Kean, Sam (17 May 2012). "ScienceShot: The Secret History of the Domesticated Apple". Archived from the original on 11 June 2016. +
  56. +
  57. ^ Coart, E.; Van Glabeke, S.; De Loose, M.; Larsen, A.S.; Roldán-Ruiz, I. (2006). "Chloroplast diversity in the genus Malus: new insights into the relationship between the European wild apple (Malus sylvestris (L.) Mill.) and the domesticated apple (Malus domestica Borkh.)". Mol. Ecol. 15 (8): 2171–2182. Bibcode:2006MolEc..15.2171C. doi:10.1111/j.1365-294x.2006.02924.x. PMID 16780433. S2CID 31481730. +
  58. +
  59. ^ Rottoli, Mauro; Pessina, Andrea (2007). "Chapter 9: Neolithic agriculture in Italy: an update of archaeobotanical data with particular emphasis on northern settlements". In Colledge, Sue; Conolly, James (eds.). The Origins and Spread of Domestic Plants in Southwest Asia and Europe (First ed.). Walnut Creek, California: Left Coast Press; University College London Institute of Archaeology Publications. pp. 142–143. ISBN 978-1-59874-988-5. OCLC 84838157. +
  60. +
  61. ^ Jump up to: a b c d Schlumbaum, Angela; van Glabeke, Sabine; Roldan-Ruiz, Isabel (January 2012). "Towards the onset of fruit tree growing north of the Alps: Ancient DNA from waterlogged apple (Malus sp.) seed fragments". Annals of Anatomy - Anatomischer Anzeiger. 194 (1): 157–162. doi:10.1016/j.aanat.2011.03.004. PMID 21501956. +
  62. +
  63. ^ Sauer, Jonathan D. (1993). Historical Geography of Crop Plants: A Select Roster (First ed.). Boca Raton, Florida: CRC Press. pp. 109–113. ISBN 978-0-8493-8901-6. LCCN 92045590. OCLC 27224696. +
  64. +
  65. ^ Plinius, Gaius Secundus (1855). The Natural History of Pliny. Vol. III. Translated by Bostock, John; Riley, Henry T. London: Henry G. Bohn. p. 303. Retrieved 3 August 2024. +
  66. +
  67. ^ Martin, Alice A. (1976). All About Apples (First ed.). Boston, Massachusetts: Houghton Mifflin Company. pp. 64–65. ISBN 978-0-395-20724-6. OCLC 1733691. Retrieved 3 August 2024. +
  68. +
  69. ^ Adamson, Melitta Weiss (2004). Food in Medieval Times (First ed.). Westport, Connecticut: Greenwood Press. pp. 19–20. ISBN 978-0-313-32147-4. LCCN 2004014054. OCLC 55738647. +
  70. +
  71. ^ Torrejón, Fernando; Cisternas, Marco; Araneda, Alberto (2004). "Efectos ambientales de la colonización española desde el río Maullín al archipiélago de Chiloé, sur de Chile" [Environmental effects of the spanish colonization from de Maullín river to the Chiloé archipelago, southern Chile]. Revista Chilena de Historia Natural (in Spanish). 77 (4): 661–677. doi:10.4067/s0716-078x2004000400009. +
  72. +
  73. ^ Smith, Archibald William (1963). A Gardener's Book of Plant Names : A Handbook of the Meaning and Origins of Plant Names (First ed.). New York: Harper & Row. p. 40. LCCN 62009906. OCLC 710612. Retrieved 10 August 2024. +
  74. +
  75. ^ Jump up to: a b c Poole, Mike (1980). "Heirloom Apples". In Lawrence, James (ed.). The Harrowsmith Reader Volume II. Camden East, Ontario: Camden House Publishing. p. 122. ISBN 978-0-920656-11-2. OCLC 1336124440. Retrieved 10 August 2024. +
  76. +
  77. ^ Van Valen, James M. (1900). History of Bergen County, New Jersey. New York: New Jersey Publishing and Engraving Company. pp. 33–34. OCLC 25697876. Retrieved 9 August 2024. +
  78. +
  79. ^ Brox, Jane (1999). Five Thousand Days Like This One (First ed.). Boston, Massachusetts: Beacon Press. pp. 150–151. ISBN 978-0-8070-2106-4. LCCN 98035051. OCLC 39605684. Retrieved 9 August 2024. +
  80. +
  81. ^ Cohen, Rachel D. (26 November 2018). "Thanks To Science, You Can Eat An Apple Every Day". The Salt. NPR. Archived from the original on 18 June 2024. Retrieved 1 August 2024. +
  82. +
  83. ^ "The Heirloom Apple Orchard". The Jentsch Lab. Cornell University. Archived from the original on 30 July 2024. Retrieved 9 August 2024. +
  84. +
  85. ^ Ranney, Thomas G. "Polyploidy: From Evolution to Landscape Plant Improvement". Proceedings of the 11th Metropolitan Tree Improvement Alliance (METRIA) Conference. 11th Metropolitan Tree Improvement Alliance Conference held in Gresham, Oregon, August 23–24, 2000. METRIA (NCSU.edu). METRIA. Archived from the original on 23 July 2010. Retrieved 7 November 2010. +
  86. +
  87. ^ Lord, William G.; Ouellette, Amy (February 2010). "Dwarf Rootstocks for Apple Trees in the Home Garden" (PDF). University of New Hampshire. Archived from the original (PDF) on 30 September 2013. Retrieved 1 September 2013. +
  88. +
  89. ^ Fallahi, Esmaeil; Colt, W. Michael; Fallahi, Bahar; Chun, Ik-Jo (January 2002). "The Importance of Apple Rootstocks on Tree Growth, Yield, Fruit Quality, Leaf Nutrition, and Photosynthesis with an Emphasis on 'Fuji'". HortTechnology. 12 (1): 38–44. doi:10.21273/HORTTECH.12.1.38. Archived (PDF) from the original on 11 February 2014. Retrieved 9 August 2024. +
  90. +
  91. ^ Parker, M.L. (September 1993). "Apple Rootstocks and Tree Spacing". North Carolina Cooperative Extension Service. Archived from the original on 11 September 2013. Retrieved 1 September 2013. +
  92. +
  93. ^ Ferree, David Curtis; Warrington, Ian J. (2003). Apples: Botany, Production, and Uses. New York: Centre for Agriculture and Bioscience International. pp. 33–35. ISBN 978-0851995922. OCLC 133167834. +
  94. +
  95. ^ Jump up to: a b c d Polomski, Bob; Reighard, Greg. "Apple HGIC 1350". Home & Garden Information Center. Clemson University. Archived from the original on 28 February 2008. Retrieved 22 January 2008. +
  96. +
  97. ^ Barahona, M. (1992). "Adaptation of Apple Varieties in Ecuador". Acta Horticulturae (310): 135–142. doi:10.17660/ActaHortic.1992.310.17. +
  98. +
  99. ^ Adamson, Nancy Lee (2011). An Assessment of Non-Apis Bees as Fruit and Vegetable Crop Pollinators in Southwest Virginia (PDF) (Doctor of Philosophy in Entomology thesis). Virginia Polytechnic Institute and State University. Archived (PDF) from the original on 20 November 2015. Retrieved 15 October 2015. +
  100. +
  101. ^ Powell, L.E. (1986). "The Chilling Requirement in Apple and Its Role in Regulating Time of Flowering in Spring in Cold-Winter Climate". Acta Horticulturae (179). Wageningen, Netherlands: International Society for Horticultural Science: 129–140. doi:10.17660/ActaHortic.1986.179.10. ISBN 978-90-6605-182-9. +
  102. +
  103. ^ Romano, Andrea (10 September 2023). "20 Best Places to Go Apple Picking in the United States". Travel + Leisure. Archived from the original on 21 April 2024. Retrieved 2 August 2024. +
  104. +
  105. ^ Graziano, Jack; Farcuh, Macarena (10 September 2021). "Controlled Atmosphere Storage of Apples". University of Maryland Extension. Archived from the original on 24 March 2023. Retrieved 2 August 2024. +
  106. +
  107. ^ "FoodKeeper App". FoodSafety.gov. United States Department of Health and Human Services. 26 April 2019. Retrieved 17 September 2024. +
  108. +
  109. ^ "4 Steps to Food Safety". FoodSafety.gov. United States Department of Health and Human Services. 12 April 2019. Retrieved 17 September 2024. +
  110. +
  111. ^ "Refrigerated storage of perishable foods". CSIRO. 26 February 2015. Archived from the original on 15 March 2015. Retrieved 25 May 2007. +
  112. +
  113. ^ Karp, David (25 October 2006). "Puff the Magic Preservative: Lasting Crunch, but Less Scent". The New York Times. Archived from the original on 3 August 2011. Retrieved 26 July 2017. +
  114. +
  115. ^ Jackson, H.S. (1914). "Powdery Mildew". In Lowther, Granville; Worthington, William (eds.). The Encyclopedia of Practical Horticulture: A Reference System of Commercial Horticulture, Covering the Practical and Scientific Phases of Horticulture, with Special Reference to Fruits and Vegetables. Vol. I. North Yakima, Washington: The Encyclopedia of Horticulture Corporation. pp. 475–476. Retrieved 1 August 2024. +
  116. +
  117. ^ Lowther, Granville; Worthington, William, eds. (1914). The Encyclopedia of Practical Horticulture: A Reference System of Commercial Horticulture, Covering the Practical and Scientific Phases of Horticulture, with Special Reference to Fruits and Vegetables. Vol. I. North Yakima, Washington: The Encyclopedia of Horticulture Corporation. pp. 45–51. Retrieved 1 August 2024. +
  118. +
  119. ^ Coli, William M.; Los, Lorraine M., eds. (2003). "Insect Pests". 2003-2004 New England Apple Pest Management Guide. University of Massachusetts Amherst. pp. 28–29. Archived from the original on 12 February 2008. Retrieved 3 March 2008. +
  120. +
  121. ^ Jump up to: a b Atthowe, Helen; Gilkeson, Linda A.; Kite, L. Patricia; Michalak, Patricia S.; Pleasant, Barbara; Reich, Lee; Scheider, Alfred F. (2009). Bradley, Fern Marshall; Ellis, Barbara W.; Martin, Deborah L. (eds.). The Organic Gardener's Handbook of Natural Pest and Disease Control. New York: Rodale, Inc. pp. 32–34. ISBN 978-1-60529-677-7. LCCN 2009039996. OCLC 419860680. +
  122. +
  123. ^ Coli, William M.; Berkett, Lorraine P.; Spitko, Robin, eds. (2003). "Other Apple Diseases". 2003-2004 New England Apple Pest Management Guide. University of Massachusetts Amherst. pp. 19–27. Archived from the original on 12 February 2008. Retrieved 3 March 2008. +
  124. +
  125. ^ Martin, Phillip L.; Krawczyk, Teresa; Khodadadi, Fatemeh; Aćimović, Srđan G.; Peter, Kari A. (2021). "Bitter Rot of Apple in the Mid-Atlantic United States: Causal Species and Evaluation of the Impacts of Regional Weather Patterns and Cultivar Susceptibility". Phytopathology. 111 (6): 966–981. doi:10.1094/PHYTO-09-20-0432-R. ISSN 0031-949X. PMID 33487025. S2CID 231701083. +
  126. +
  127. ^ Erler, Fedai (1 January 2010). "Efficacy of tree trunk coating materials in the control of the apple clearwing, Synanthedon myopaeformis". Journal of Insect Science. 10 (1): 63. doi:10.1673/031.010.6301. PMC 3014806. PMID 20672979. +
  128. +
  129. ^ Elzebroek, A. T. G.; Wind, Koop (2008). Guide to Cultivated Plants. Wallingford, United Kingdom: CABI. p. 27. ISBN 978-1-84593-356-2. LCCN 2007028459. OCLC 156975183. Archived from the original on 20 October 2020. Retrieved 6 October 2020. +
  130. +
  131. ^ Jump up to: a b "Apple – Malus domestica". Natural England. Archived from the original on 12 May 2008. Retrieved 22 January 2008. +
  132. +
  133. ^ "Home". National Fruit Collection. Archived from the original on 15 June 2012. Retrieved 2 December 2012. +
  134. +
  135. ^ "ECPGR Malus/Pyrus Working Group Members". Ecpgr.cgiar.org. 22 July 2002. Archived from the original on 26 August 2014. Retrieved 25 August 2014. +
  136. +
  137. ^ Jump up to: a b Tarjan, Sue (Fall 2006). "Autumn Apple Musings" (PDF). News & Notes of the UCSC Farm & Garden, Center for Agroecology & Sustainable Food Systems. pp. 1–2. Archived from the original (PDF) on 11 August 2007. Retrieved 24 January 2008. +
  138. +
  139. ^ Beck, Kellen (17 October 2020). "How breeders bring out the best in new apples". Mashable. Archived from the original on 31 July 2024. Retrieved 31 July 2024. +
  140. +
  141. ^ Migicovsky, Zoë (22 August 2021). "How a few good apples spawned today's top varieties — and why breeders must branch out". The Conversation. Archived from the original on 31 July 2024. Retrieved 31 July 2024. +
  142. +
  143. ^ Peil, A.; Dunemann, F.; Richter, K.; Hoefer, M.; Király, I.; Flachowsky, H.; Hanke, M.-V. (2008). "Resistance Breeding in Apple at Dresden-Pillnitz". Ecofruit - 13th International Conference on Cultivation Technique and Phytopathological Problems in Organic Fruit-Growing: Proceedings to the Conference from 18th February to 20th February 2008 at Weinsberg/Germany (in German): 220–225. Archived from the original on 28 January 2021. Retrieved 31 July 2024. +
  144. +
  145. ^ Jump up to: a b "World apple situation". Archived from the original on 11 February 2008. Retrieved 24 January 2008. +
  146. +
  147. ^ Weaver, Sue (June–July 2003). "Crops & Gardening – Apples of Antiquity". Hobby Farms Magazine. Archived from the original on 19 February 2017. +
  148. +
  149. ^ Jump up to: a b c "Apple production in 2022; from pick lists: Crops/World Regions/Production Quantity". FAOSTAT, UN Food and Agriculture Organization, Statistics Division. 2024. Archived from the original on 12 November 2016. Retrieved 18 June 2024. +
  150. +
  151. ^ Nelson, Lewis S.; Shih, Richard D.; Balick, Michael J. (2007). Handbook of Poisonous and Injurious Plants (Second ed.). New York: New York Botanical Garden : Springer. pp. 27, 211–212. ISBN 978-0387-31268-2. LCCN 2005938815. OCLC 77537459. Retrieved 11 September 2024. +
  152. +
  153. ^ "Amygdalin". Toxnet, US Library of Medicine. Archived from the original on 21 April 2017. Retrieved 20 April 2017. +
  154. +
  155. ^ Jump up to: a b c d e f "General Information – Apple". Informall. Archived from the original on 23 July 2012. Retrieved 17 October 2011. +
  156. +
  157. ^ Landau, Elizabeth, Oral allergy syndrome may explain mysterious reactions, 8 April 2009, CNN Health, accessed 17 October 2011 +
  158. +
  159. ^ United States Food and Drug Administration (2024). "Daily Value on the Nutrition and Supplement Facts Labels". FDA. Archived from the original on 27 March 2024. Retrieved 28 March 2024. +
  160. +
  161. ^ National Academies of Sciences, Engineering, and Medicine; Health and Medicine Division; Food and Nutrition Board; Committee to Review the Dietary Reference Intakes for Sodium and Potassium (2019). Oria, Maria; Harrison, Meghan; Stallings, Virginia A. (eds.). Dietary Reference Intakes for Sodium and Potassium. The National Academies Collection: Reports funded by National Institutes of Health. Washington, DC: National Academies Press (US). ISBN 978-0-309-48834-1. PMID 30844154. Archived from the original on 9 May 2024. Retrieved 21 June 2024. +
  162. +
  163. ^ Jump up to: a b c d Davidson, Alan (2014). "Apple". In Jaine, Tom (ed.). The Oxford Companion to Food. Illustrated by Soun Vannithone (Third ed.). Oxford: Oxford University Press. pp. 27–31. ISBN 978-0-19-967733-7. LCCN 2013957569. OCLC 890807357. OL 27172691M. Retrieved 18 September 2024. +
  164. +
  165. ^ Traverso, Amy (2011). The Apple Lover's Cookbook. Photographs by Squire Fox (First ed.). New York: W.W. Norton & Company. pp. 16, 32, 35, 45, 92, 137, 262–263, 275. ISBN 978-0-393-06599-2. LCCN 2011016560. OCLC 711051767. OL 16450839W. +
  166. +
  167. ^ Kellogg, Kristi (15 January 2015). "81 Best Apple Recipes: Dinners, Desserts, Salads, and More". Epicurious. Archived from the original on 18 October 2020. Retrieved 17 October 2020. +
  168. +
  169. ^ Davidson, Alan (2014). "Toffee Apple". In Jaine, Tom (ed.). The Oxford Companion to Food. Illustrated by Soun Vannithone (Third ed.). Oxford: Oxford University Press. p. 824. ISBN 978-0-19-967733-7. LCCN 2013957569. OCLC 890807357. OL 27172691M. Retrieved 18 September 2024. +
  170. +
  171. ^ Shurpin, Yehuda. "Why All the Symbolic Rosh Hashanah Foods? "בולבול"". Chabad.org. Archived from the original on 21 March 2023. Retrieved 21 March 2023. +
  172. +
  173. ^ Yepsen, Roger B. (2017) [1994]. Apples (Revised and Updated ed.). New York: W.W. Norton & Company. p. 52. ISBN 978-1-68268-019-3. LCCN 2017010136. OCLC 973918728. +
  174. +
  175. ^ "Organic apples". USDA Agricultural Marketing Service. February 2016. Archived from the original on 24 February 2017. Retrieved 23 February 2017. +
  176. +
  177. ^ Jump up to: a b "European Organic Apple Production Demonstrates the Value of Pesticides" (PDF). CropLife Foundation, Washington, DC. December 2011. Archived (PDF) from the original on 24 February 2017. Retrieved 23 February 2017. +
  178. +
  179. ^ Ribeiro, Flávia A.P.; Gomes de Moura, Carolina F.; Aguiar, Odair; de Oliveira, Flavia; Spadari, Regina C.; Oliveira, Nara R.C.; Oshima, Celina T.F.; Ribeiro, Daniel A. (September 2014). "The chemopreventive activity of apple against carcinogenesis: antioxidant activity and cell cycle control". European Journal of Cancer Prevention (Review). 23 (5): 477–480. doi:10.1097/CEJ.0000000000000005. PMID 24366437. S2CID 23026644. +
  180. +
  181. ^ Nicolas, J. J.; Richard-Forget, F. C.; Goupy, P. M.; Amiot, M. J.; Aubert, S. Y. (1 January 1994). "Enzymatic browning reactions in apple and apple products". Critical Reviews in Food Science and Nutrition. 34 (2): 109–157. doi:10.1080/10408399409527653. PMID 8011143. +
  182. +
  183. ^ "PPO silencing". Okanagan Specialty Fruits. 2019. Archived from the original on 27 April 2021. Retrieved 14 November 2019. +
  184. +
  185. ^ "United States: GM non-browning Arctic apple expands into foodservice". Fresh Fruit Portal. 13 August 2019. Archived from the original on 27 June 2021. Retrieved 14 November 2019. +
  186. +
  187. ^ "Okanagan Specialty Fruits: Biotechnology Consultation Agency Response Letter BNF 000132". U.S. Food and Drug Administration. 20 March 2015. Archived from the original on 31 October 2017. Retrieved 14 November 2019. +
  188. +
  189. ^ "Questions and answers: Arctic Apple". Canadian Food Inspection Agency, Government of Canada. 8 September 2017. Archived from the original on 19 September 2018. Retrieved 14 November 2019. +
  190. +
  191. ^ Yu, Xiuzhu; Van De Voort, Frederick R.; Li, Zhixi; Yue, Tianli (2007). "Proximate Composition of the Apple Seed and Characterization of Its Oil". International Journal of Food Engineering. 3 (5). doi:10.2202/1556-3758.1283. S2CID 98590230. +
  192. +
  193. ^ Jump up to: a b c Davidson, Hilda Roderick Ellis (1990) [1st pub. 1964]. Gods and Myths of Northern Europe. London: Penguin Books. pp. 165–166. ISBN 0-14-013627-4. OCLC 29336401. +
  194. +
  195. ^ Davidson, Hilda Ellis (1998). Roles of the Northern Goddess. London; New York: Routledge. pp. 146–147. doi:10.4324/9780203025550. ISBN 0-415-13610-5. LCCN 97018309. OCLC 48138055. +
  196. +
  197. ^ Biedermann, Hans (1992). Dictionary of Symbolism. Translated by Hulbert, James. New York: Facts on File. pp. 16–17. ISBN 978-0-8160-2593-0. LCCN 91044933. OCLC 25092926. Retrieved 3 October 2024. +
  198. +
  199. ^ Jump up to: a b Ruck, Carl A. P.; Staples, Blaise D.; Heinrich, Clark (2001). The apples of Apollo : pagan and Christian mysteries of the Eucharist. Durham, North Carolina: Carolina Academic Press. pp. 64–70. ISBN 978-0-89089-924-3. LCCN 00040351. OCLC 46337324. +
  200. +
  201. ^ "Eris - Greek Goddess of Strife & Discord (Roman Discordia)". Theoi Project. Aaron J. Atsma. Archived from the original on 25 September 2024. Retrieved 26 September 2024. +
  202. +
  203. ^ Lucian (1905). The Works of Lucian of Samosata. Vol. I. Translated by Fowler, H.W.; Fowler, F.G. (First ed.). Oxford: Clarendon Press. pp. 78–85. LCCN 06001045. OCLC 506365. Retrieved 26 September 2024. +
  204. +
  205. ^ "Judgement of Paris - Greek Mythology". Theoi Project. Aaron J. Atsma. Archived from the original on 24 August 2024. Retrieved 26 September 2024. +
  206. +
  207. ^ Plato (1997). "Epigrams". In Cooper, John M.; Hutchinson, D.S. (eds.). Complete Works. Translated by Edmonds, J.M.; Cooper, John M. Indianapolis, Indiana: Hackett Publishing. p. 1744. ISBN 0-87220-349-2. LCCN 96053280. OCLC 36178550. Retrieved 27 September 2024. +
  208. +
  209. ^ Pinsent, John (1969). Greek Mythology (First ed.). London: Paul Hamlyn. p. 79. ISBN 978-0-600-02422-4. LCCN 78449216. OCLC 61702. Retrieved 3 October 2024. +
  210. +
  211. ^ "Atalanta (Atalante) - Arcadian Heroine of Greek Mythology". Theoi Project. Aaron J. Atsma. Archived from the original on 27 September 2024. Retrieved 3 October 2024. +
  212. +
  213. ^ Flieger, Verlyn (2005). Interrupted Music : The Making of Tolkien's Mythology. Kent, Ohio: Kent State University Press. pp. 122–123. ISBN 978-0-87338-824-5. LCCN 2004024490. OCLC 56805947. +
  214. +
  215. ^ Jump up to: a b "Why Do the Chinese Give Apples Around Christmas?". Teach English In China. 22 December 2019. Archived from the original on 1 October 2020. Retrieved 3 September 2024. +
  216. +
  217. ^ Jump up to: a b Macrone, Michael (1998). Brush up your Bible!. New York: Gramercy Books. pp. 15–16, 340–341. ISBN 978-0-517-20189-3. OCLC 38270894. Retrieved 31 July 2024. +
  218. +
  219. ^ Kissling, Paul J. (2004). Genesis. Vol. 1. Joplin, Missouri: College Press. p. 193. ISBN 978-0-89900-875-2. LCCN 2004022577. OCLC 56672257. Archived from the original on 26 January 2021. Retrieved 6 October 2020. +
  220. +
  221. ^ Genesis 2:17 +
  222. +
  223. ^ Hendel, Ronald S. (2013). The Book of Genesis: A Biography. Princeton, New Jersey: Princeton University Press. p. 114. ISBN 978-0-69114012-4. LCCN 2012015634. OCLC 788265521. Archived from the original on 5 March 2023. Retrieved 4 October 2024. +
  224. +
  225. ^ Mieder, Wolfgang; Kingsbury, Stewart A.; Harder, Kelsie B., eds. (1996) [1992]. A Dictionary of American Proverbs (Paperback ed.). New York: Oxford University Press. p. 23. ISBN 978-0-19-511133-0. LCCN 91015508. OCLC 23693799. Retrieved 23 August 2024. +
  226. +
  227. ^ Pollan, Michael (2001). The Botany of Desire: A Plant's-Eye View of the World (First ed.). New York: Random House. pp. 9, 22, 50. ISBN 978-0-375-50129-6. LCCN 00066479. OCLC 49803415. +
  228. +
+

Further reading

+ +
+
  • Media related to Apples at Wikimedia Commons
+ + + + + + + + + + + +
+
+ +
+
+ +
+ +
+
+
+ +
+ + +
\ No newline at end of file diff --git a/tests/async/test_async_doanloader.py b/tests/async/test_async_doanloader.py new file mode 100644 index 00000000..4798b4ca --- /dev/null +++ b/tests/async/test_async_doanloader.py @@ -0,0 +1,229 @@ +import os +import sys +import asyncio +import shutil +from typing import List +import tempfile +import time + +# Add the parent directory to the Python path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.async_webcrawler import AsyncWebCrawler + +class TestDownloads: + def __init__(self): + self.temp_dir = tempfile.mkdtemp(prefix="crawl4ai_test_") + self.download_dir = os.path.join(self.temp_dir, "downloads") + os.makedirs(self.download_dir, exist_ok=True) + self.results: List[str] = [] + + def cleanup(self): + shutil.rmtree(self.temp_dir) + + def log_result(self, test_name: str, success: bool, message: str = ""): + result = f"{'✅' if success else '❌'} {test_name}: {message}" + self.results.append(result) + print(result) + + async def test_basic_download(self): + """Test basic file download functionality""" + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + verbose=True + ) as crawler: + # Python.org downloads page typically has stable download links + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Click first download link + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + "Basic Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result("Basic Download", False, str(e)) + + async def test_persistent_context_download(self): + """Test downloads with persistent context""" + try: + 
user_data_dir = os.path.join(self.temp_dir, "user_data") + os.makedirs(user_data_dir, exist_ok=True) + + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + use_persistent_context=True, + user_data_dir=user_data_dir, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + "Persistent Context Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result("Persistent Context Download", False, str(e)) + + async def test_multiple_downloads(self): + """Test multiple simultaneous downloads""" + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + // Click multiple download links + const downloadLinks = document.querySelectorAll('a[href$=".exe"]'); + downloadLinks.forEach(link => link.click()); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 1 + self.log_result( + "Multiple Downloads", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "Not enough files downloaded" + ) + except Exception as e: + self.log_result("Multiple Downloads", False, str(e)) + + async def test_different_browsers(self): + """Test downloads across different browser types""" + browsers = ["chromium", "firefox", "webkit"] + + for browser_type in browsers: + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path=self.download_dir, + browser_type=browser_type, + verbose=True + ) as crawler: + result = await 
crawler.arun( + url="https://www.python.org/downloads/", + js_code=""" + const downloadLink = document.querySelector('a[href$=".exe"]'); + if (downloadLink) downloadLink.click(); + """ + ) + + success = result.downloaded_files is not None and len(result.downloaded_files) > 0 + self.log_result( + f"{browser_type.title()} Download", + success, + f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded" + ) + except Exception as e: + self.log_result(f"{browser_type.title()} Download", False, str(e)) + + async def test_edge_cases(self): + """Test various edge cases""" + + # Test 1: Downloads without specifying download path + try: + async with AsyncWebCrawler( + accept_downloads=True, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + self.log_result( + "Default Download Path", + True, + f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}" + ) + except Exception as e: + self.log_result("Default Download Path", False, str(e)) + + # Test 2: Downloads with invalid path + try: + async with AsyncWebCrawler( + accept_downloads=True, + downloads_path="/invalid/path/that/doesnt/exist", + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + self.log_result("Invalid Download Path", False, "Should have raised an error") + except Exception as e: + self.log_result("Invalid Download Path", True, "Correctly handled invalid path") + + # Test 3: Download with accept_downloads=False + try: + async with AsyncWebCrawler( + accept_downloads=False, + verbose=True + ) as crawler: + result = await crawler.arun( + url="https://www.python.org/downloads/", + js_code="document.querySelector('a[href$=\".exe\"]').click()" + ) + success = result.downloaded_files is None + 
self.log_result( + "Disabled Downloads", + success, + "Correctly ignored downloads" if success else "Unexpectedly downloaded files" + ) + except Exception as e: + self.log_result("Disabled Downloads", False, str(e)) + + async def run_all_tests(self): + """Run all test cases""" + print("\n🧪 Running Download Tests...\n") + + test_methods = [ + self.test_basic_download, + self.test_persistent_context_download, + self.test_multiple_downloads, + self.test_different_browsers, + self.test_edge_cases + ] + + for test in test_methods: + print(f"\n📝 Running {test.__doc__}...") + await test() + await asyncio.sleep(2) # Brief pause between tests + + print("\n📊 Test Results Summary:") + for result in self.results: + print(result) + + successes = len([r for r in self.results if '✅' in r]) + total = len(self.results) + print(f"\nTotal: {successes}/{total} tests passed") + + self.cleanup() + +async def main(): + tester = TestDownloads() + await tester.run_all_tests() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/tests/async/test_content_filter.py b/tests/async/test_content_filter.py new file mode 100644 index 00000000..a873c414 --- /dev/null +++ b/tests/async/test_content_filter.py @@ -0,0 +1,175 @@ +import os, sys +import pytest +from bs4 import BeautifulSoup +from typing import List + +# Add the parent directory to the Python path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +from crawl4ai.content_filter_strategy import BM25ContentFilter + +@pytest.fixture +def basic_html(): + return """ + + + Test Article + + + + +

Main Heading

+
+

This is a long paragraph with more than fifty words. It continues with more text to ensure we meet the minimum word count threshold. We need to make sure this paragraph is substantial enough to be considered for extraction according to our filtering rules. This should be enough words now.

+ +
+ + + """ + +@pytest.fixture +def wiki_html(): + return """ + + + Wikipedia Article + + +

Article Title

+

Section 1

+

Short but important section header description.

+
+

Long paragraph with sufficient words to meet the minimum threshold. This paragraph continues with more text to ensure we have enough content for proper testing. We need to make sure this has enough words to pass our filters and be considered valid content for extraction purposes.

+
+ + + """ + +@pytest.fixture +def no_meta_html(): + return """ + + +

Simple Page

+

First paragraph that should be used as fallback for query when no meta tags exist. This text needs to be long enough to serve as a meaningful fallback for our content extraction process.

+ + + """ + +class TestBM25ContentFilter: + def test_basic_extraction(self, basic_html): + """Test basic content extraction functionality""" + filter = BM25ContentFilter() + contents = filter.filter_content(basic_html) + + assert contents, "Should extract content" + assert len(contents) >= 1, "Should extract at least one content block" + assert "long paragraph" in ' '.join(contents).lower() + assert "navigation" not in ' '.join(contents).lower() + + def test_user_query_override(self, basic_html): + """Test that user query overrides metadata extraction""" + user_query = "specific test query" + filter = BM25ContentFilter(user_query=user_query) + + # Access internal state to verify query usage + soup = BeautifulSoup(basic_html, 'lxml') + extracted_query = filter.extract_page_query(soup.find('head')) + + assert extracted_query == user_query + assert "Test description" not in extracted_query + + def test_header_extraction(self, wiki_html): + """Test that headers are properly extracted despite length""" + filter = BM25ContentFilter() + contents = filter.filter_content(wiki_html) + + combined_content = ' '.join(contents).lower() + assert "section 1" in combined_content, "Should include section header" + assert "article title" in combined_content, "Should include main title" + + def test_no_metadata_fallback(self, no_meta_html): + """Test fallback behavior when no metadata is present""" + filter = BM25ContentFilter() + contents = filter.filter_content(no_meta_html) + + assert contents, "Should extract content even without metadata" + assert "First paragraph" in ' '.join(contents), "Should use first paragraph content" + + def test_empty_input(self): + """Test handling of empty input""" + filter = BM25ContentFilter() + assert filter.filter_content("") == [] + assert filter.filter_content(None) == [] + + def test_malformed_html(self): + """Test handling of malformed HTML""" + malformed_html = "

Unclosed paragraph

Nested content

" + filter = BM25ContentFilter() + contents = filter.filter_content(malformed_html) + + assert isinstance(contents, list), "Should return list even with malformed HTML" + + def test_threshold_behavior(self, basic_html): + """Test different BM25 threshold values""" + strict_filter = BM25ContentFilter(bm25_threshold=2.0) + lenient_filter = BM25ContentFilter(bm25_threshold=0.5) + + strict_contents = strict_filter.filter_content(basic_html) + lenient_contents = lenient_filter.filter_content(basic_html) + + assert len(strict_contents) <= len(lenient_contents), \ + "Strict threshold should extract fewer elements" + + def test_html_cleaning(self, basic_html): + """Test HTML cleaning functionality""" + filter = BM25ContentFilter() + contents = filter.filter_content(basic_html) + + cleaned_content = ' '.join(contents) + assert 'class=' not in cleaned_content, "Should remove class attributes" + assert 'style=' not in cleaned_content, "Should remove style attributes" + assert ' +
{'

Test content. ' * 1000}

+ + """ + filter = BM25ContentFilter() + contents = filter.filter_content(large_html) + assert contents, "Should handle large content blocks" + + @pytest.mark.parametrize("unwanted_tag", [ + 'script', 'style', 'nav', 'footer', 'header' + ]) + def test_excluded_tags(self, unwanted_tag): + """Test that specific tags are properly excluded""" + html = f""" + + <{unwanted_tag}>Should not appear +

Should appear

+ + """ + filter = BM25ContentFilter() + contents = filter.filter_content(html) + + combined_content = ' '.join(contents).lower() + assert "should not appear" not in combined_content + + def test_performance(self, basic_html): + """Test performance with timer""" + filter = BM25ContentFilter() + + import time + start = time.perf_counter() + filter.filter_content(basic_html) + duration = time.perf_counter() - start + + assert duration < 1.0, f"Processing took too long: {duration:.2f} seconds" + +if __name__ == "__main__": + pytest.main([__file__]) \ No newline at end of file diff --git a/tests/async/test_content_scraper_strategy.py b/tests/async/test_content_scraper_strategy.py new file mode 100644 index 00000000..62c49148 --- /dev/null +++ b/tests/async/test_content_scraper_strategy.py @@ -0,0 +1,162 @@ +import asyncio +from bs4 import BeautifulSoup +from typing import Dict, Any +import os +import sys +import time +import csv +from tabulate import tabulate +from dataclasses import dataclass +from typing import List, Dict + +parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +from crawl4ai.content_scraping_strategy import WebScrapingStrategy +from crawl4ai.content_scraping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent +# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent + +@dataclass +class TestResult: + name: str + success: bool + images: int + internal_links: int + external_links: int + markdown_length: int + execution_time: float + +class StrategyTester: + def __init__(self): + self.new_scraper = WebScrapingStrategy() + self.current_scraper = WebScrapingStrategyCurrent() + with open(__location__ + '/sample_wikipedia.html', 'r', encoding='utf-8') as f: + self.WIKI_HTML = f.read() + self.results = {'new': [], 'current': []} + + def 
run_test(self, name: str, **kwargs) -> tuple[TestResult, TestResult]: + results = [] + for scraper in [self.new_scraper, self.current_scraper]: + start_time = time.time() + result = scraper._get_content_of_website_optimized( + url="https://en.wikipedia.org/wiki/Test", + html=self.WIKI_HTML, + **kwargs + ) + execution_time = time.time() - start_time + + test_result = TestResult( + name=name, + success=result['success'], + images=len(result['media']['images']), + internal_links=len(result['links']['internal']), + external_links=len(result['links']['external']), + markdown_length=len(result['markdown']), + execution_time=execution_time + ) + results.append(test_result) + + return results[0], results[1] # new, current + + def run_all_tests(self): + test_cases = [ + ("Basic Extraction", {}), + ("Exclude Tags", {'excluded_tags': ['table', 'div.infobox', 'div.navbox']}), + ("Word Threshold", {'word_count_threshold': 50}), + ("CSS Selector", {'css_selector': 'div.mw-parser-output > p'}), + ("Link Exclusions", { + 'exclude_external_links': True, + 'exclude_social_media_links': True, + 'exclude_domains': ['facebook.com', 'twitter.com'] + }), + ("Media Handling", { + 'exclude_external_images': True, + 'image_description_min_word_threshold': 20 + }), + ("Text Only", { + 'only_text': True, + 'remove_forms': True + }), + ("HTML Cleaning", { + 'clean_html': True, + 'keep_data_attributes': True + }), + ("HTML2Text Options", { + 'html2text': { + 'skip_internal_links': True, + 'single_line_break': True, + 'mark_code': True, + 'preserve_tags': ['pre', 'code'] + } + }) + ] + + all_results = [] + for name, kwargs in test_cases: + try: + new_result, current_result = self.run_test(name, **kwargs) + all_results.append((name, new_result, current_result)) + except Exception as e: + print(f"Error in {name}: {str(e)}") + + self.save_results_to_csv(all_results) + self.print_comparison_table(all_results) + + def save_results_to_csv(self, all_results: List[tuple]): + csv_file = 
os.path.join(__location__, 'strategy_comparison_results.csv') + with open(csv_file, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links', + 'External Links', 'Markdown Length', 'Execution Time']) + + for name, new_result, current_result in all_results: + writer.writerow([name, 'New', new_result.success, new_result.images, + new_result.internal_links, new_result.external_links, + new_result.markdown_length, f"{new_result.execution_time:.3f}"]) + writer.writerow([name, 'Current', current_result.success, current_result.images, + current_result.internal_links, current_result.external_links, + current_result.markdown_length, f"{current_result.execution_time:.3f}"]) + + def print_comparison_table(self, all_results: List[tuple]): + table_data = [] + headers = ['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links', + 'External Links', 'Markdown Length', 'Time (s)'] + + for name, new_result, current_result in all_results: + # Check for differences + differences = [] + if new_result.images != current_result.images: differences.append('images') + if new_result.internal_links != current_result.internal_links: differences.append('internal_links') + if new_result.external_links != current_result.external_links: differences.append('external_links') + if new_result.markdown_length != current_result.markdown_length: differences.append('markdown') + + # Add row for new strategy + new_row = [ + name, 'New', new_result.success, new_result.images, + new_result.internal_links, new_result.external_links, + new_result.markdown_length, f"{new_result.execution_time:.3f}" + ] + table_data.append(new_row) + + # Add row for current strategy + current_row = [ + '', 'Current', current_result.success, current_result.images, + current_result.internal_links, current_result.external_links, + current_result.markdown_length, f"{current_result.execution_time:.3f}" + ] + table_data.append(current_row) + + # Add 
difference summary if any + if differences: + table_data.append(['', '⚠️ Differences', ', '.join(differences), '', '', '', '', '']) + + # Add empty row for better readability + table_data.append([''] * len(headers)) + + print("\nStrategy Comparison Results:") + print(tabulate(table_data, headers=headers, tablefmt='grid')) + +if __name__ == "__main__": + tester = StrategyTester() + tester.run_all_tests() \ No newline at end of file diff --git a/tests/async/test_markdown_genertor.py b/tests/async/test_markdown_genertor.py new file mode 100644 index 00000000..025a0318 --- /dev/null +++ b/tests/async/test_markdown_genertor.py @@ -0,0 +1,165 @@ +# ## Issue #236 +# - **Last Updated:** 2024-11-11 01:42:14 +# - **Title:** [user data crawling opens two windows, unable to control correct user browser](https://github.com/unclecode/crawl4ai/issues/236) +# - **State:** open + +import os, sys, time +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +__location__ = os.path.realpath( os.path.join(os.getcwd(), os.path.dirname(__file__))) +import asyncio +import os +import time +from typing import Dict, Any +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerationStrategy + +# Get current directory +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +def print_test_result(name: str, result: Dict[str, Any], execution_time: float): + """Helper function to print test results.""" + print(f"\n{'='*20} {name} {'='*20}") + print(f"Execution time: {execution_time:.4f} seconds") + + + # Save markdown to files + for key, content in result.items(): + if isinstance(content, str): + with open(__location__ + f"/output/{name.lower()}_{key}.md", "w") as f: + f.write(content) + + # # Print first few lines of each markdown version + # for key, content in result.items(): + # if isinstance(content, str): + # preview = '\n'.join(content.split('\n')[:3]) + # print(f"\n{key} (first 3 lines):") + # 
# NOTE(review): the two statements below are the tail of a definition that
# begins before this chunk. They are kept verbatim (as comments, so that this
# block parses on its own) and must be re-attached to that definition.
#     print(preview)
#     # print(f"Total length: {len(content)} characters")


def test_basic_markdown_conversion():
    """Test basic markdown conversion with links.

    Reads ``data/wikipedia.html`` next to this module, converts it with
    ``DefaultMarkdownGenerationStrategy`` and checks that the raw markdown,
    the citation-annotated markdown and the references section are all
    non-empty and use the expected markers.
    """
    # Explicit encoding: the fixture and the assertions below involve
    # non-ASCII text, so the platform default encoding must not be relied on.
    with open(__location__ + "/data/wikipedia.html", "r", encoding="utf-8") as f:
        cleaned_html = f.read()

    generator = DefaultMarkdownGenerationStrategy()

    start_time = time.perf_counter()
    result = generator.generate_markdown(
        cleaned_html=cleaned_html,
        base_url="https://en.wikipedia.org"
    )
    execution_time = time.perf_counter() - start_time

    print_test_result("Basic Markdown Conversion", {
        'raw': result.raw_markdown,
        'with_citations': result.markdown_with_citations,
        'references': result.references_markdown
    }, execution_time)

    # Basic assertions
    assert result.raw_markdown, "Raw markdown should not be empty"
    assert result.markdown_with_citations, "Markdown with citations should not be empty"
    assert result.references_markdown, "References should not be empty"
    assert "⟨" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets"
    assert "## References" in result.references_markdown, "Should contain references section"


def test_relative_links():
    """Test handling of relative links with base URL."""
    markdown = """
    Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com).
    Also an [image](/images/test.png) and another [page](/wiki/Banana).
    """

    generator = DefaultMarkdownGenerationStrategy()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://en.wikipedia.org"
    )

    # Relative targets must be resolved against base_url; absolute ones kept.
    assert "https://en.wikipedia.org/wiki/Apple" in result.references_markdown
    assert "https://example.com" in result.references_markdown
    assert "https://en.wikipedia.org/images/test.png" in result.references_markdown


def test_duplicate_links():
    """Test handling of duplicate links."""
    markdown = """
    Here's a [link](/test) and another [link](/test) and a [different link](/other).
    """

    generator = DefaultMarkdownGenerationStrategy()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://example.com"
    )

    # Count citations in markdown
    citations = result.markdown_with_citations.count("⟨1⟩")
    assert citations == 2, "Same link should use same citation number"


def test_link_descriptions():
    """Test handling of link titles and descriptions."""
    markdown = """
    Here's a [link with title](/test "Test Title") and a [link with description](/other) to test.
    """

    generator = DefaultMarkdownGenerationStrategy()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://example.com"
    )

    assert "Test Title" in result.references_markdown, "Link title should be in references"
    assert "link with description" in result.references_markdown, "Link text should be in references"


def test_performance_large_document():
    """Test performance with large document.

    Runs the generator several times over ``data/wikipedia.md`` and prints
    the average, minimum and maximum wall-clock time. Nothing is asserted.
    """
    with open(__location__ + "/data/wikipedia.md", "r", encoding="utf-8") as f:
        markdown = f.read()

    # Test with multiple iterations
    iterations = 5
    times = []

    generator = DefaultMarkdownGenerationStrategy()

    for _ in range(iterations):
        start_time = time.perf_counter()
        # Only the elapsed time matters here; the output is discarded.
        generator.generate_markdown(
            cleaned_html=markdown,
            base_url="https://en.wikipedia.org"
        )
        end_time = time.perf_counter()
        times.append(end_time - start_time)

    avg_time = sum(times) / len(times)
    print(f"\n{'='*20} Performance Test {'='*20}")
    print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds")
    print(f"Min time: {min(times):.4f} seconds")
    print(f"Max time: {max(times):.4f} seconds")


def test_image_links():
    """Test handling of image links."""
    markdown = """
    Here's an ![image](/image.png "Image Title") and another ![image](/other.jpg).
    And a regular [link](/page).
    """

    generator = DefaultMarkdownGenerationStrategy()
    result = generator.generate_markdown(
        cleaned_html=markdown,
        base_url="https://example.com"
    )

    assert "![" in result.markdown_with_citations, "Image markdown syntax should be preserved"
    assert "Image Title" in result.references_markdown, "Image title should be in references"


if __name__ == "__main__":
    print("Running markdown generation strategy tests...")

    test_basic_markdown_conversion()
    test_relative_links()
    test_duplicate_links()
    test_link_descriptions()
    test_performance_large_document()
    test_image_links()

# ---------------------------------------------------------------------------
# Patch metadata that separated the two files in the original text, kept
# verbatim for provenance:
#   \ No newline at end of file
#   diff --git a/tests/docker_example.py b/tests/docker_example.py
#   new file mode 100644
#   index 00000000..658e80fd
#   --- /dev/null
#   +++ b/tests/docker_example.py
#   @@ -0,0 +1,332 @@
# ---------------------------------------------------------------------------
import requests
import json
import time
import sys
import base64
import os
from typing import Dict, Any, Optional


class Crawl4AiTester:
    """Minimal HTTP client for exercising a running Crawl4AI server.

    Attributes:
        base_url: Root URL of the server, without a trailing slash.
        api_token: Bearer token; falls back to ``CRAWL4AI_API_TOKEN``.
        headers: ``Authorization`` header dict, or ``{}`` without a token.
    """

    # Per-request timeout in seconds for the submit/poll calls. Without it a
    # stalled connection blocks forever and the overall ``timeout`` passed to
    # ``submit_and_wait`` can never fire.
    request_timeout = 30

    def __init__(self, base_url: str = "http://localhost:11235", api_token: Optional[str] = None):
        self.base_url = base_url
        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN')  # Check environment variable as fallback
        self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}

    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
        """Submit a crawl job to ``/crawl`` and poll ``/task/<id>`` until done.

        Args:
            request_data: JSON-serialisable request payload.
            timeout: Maximum number of seconds to wait for completion.

        Returns:
            The task status payload once its ``status`` is ``"completed"``.

        Raises:
            TimeoutError: The task did not complete within ``timeout``.
            requests.HTTPError: The server answered with an error status.
            Exception: The token was rejected (HTTP 403) or the task failed.
        """
        # Submit crawl job
        response = requests.post(
            f"{self.base_url}/crawl",
            json=request_data,
            headers=self.headers,
            timeout=self.request_timeout,
        )
        if response.status_code == 403:
            raise Exception("API token is invalid or missing")
        # Surface other HTTP errors directly instead of as a KeyError below.
        response.raise_for_status()
        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")

        # Poll for result; a monotonic clock is immune to wall-clock jumps.
        start_time = time.monotonic()
        while True:
            if time.monotonic() - start_time > timeout:
                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")

            result = requests.get(
                f"{self.base_url}/task/{task_id}",
                headers=self.headers,
                timeout=self.request_timeout,
            )
            result.raise_for_status()
            status = result.json()

            if status["status"] == "failed":
                print("Task failed:", status.get("error"))
                raise Exception(f"Task failed: {status.get('error')}")

            if status["status"] == "completed":
                return status

            time.sleep(2)

    def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """Run a crawl through the blocking ``/crawl_sync`` endpoint.

        Raises:
            TimeoutError: The server answered with HTTP 408.
            requests.HTTPError: Any other error status.
        """
        response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
        if response.status_code == 408:
            raise TimeoutError("Task did not complete within server timeout")
        response.raise_for_status()
        return response.json()


def test_docker_deployment(version="basic"):
    """Check that the service is healthy, then run the enabled test cases.

    Exits the process with status 1 if the health endpoint cannot be reached
    after ``max_retries`` attempts.
    """
    tester = Crawl4AiTester(
        # base_url="http://localhost:11235" ,
        base_url="https://crawl4ai-sby74.ondigitalocean.app",
        api_token="test"
    )
    print(f"Testing Crawl4AI Docker {version} version")

    # Health check with timeout and retry
    max_retries = 5
    for i in range(max_retries):
        try:
            health = requests.get(f"{tester.base_url}/health", timeout=10)
            print("Health check:", health.json())
            break
        except requests.exceptions.RequestException:
            if i == max_retries - 1:
                print(f"Failed to connect after {max_retries} attempts")
                sys.exit(1)
            print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
            time.sleep(5)

    # Test cases based on version
    # NOTE(review): test_basic_crawl is invoked twice in a row; presumably to
    # exercise a repeated request with the same session_id -- confirm intent.
    test_basic_crawl(tester)
    test_basic_crawl(tester)
    test_basic_crawl_sync(tester)

    # if version in ["full", "transformer"]:
    #     test_cosine_extraction(tester)

    # test_js_execution(tester)
    # test_css_selector(tester)
    # test_structured_extraction(tester)
    # test_llm_extraction(tester)
    # test_llm_with_ollama(tester)
    # test_screenshot(tester)


def test_basic_crawl(tester: Crawl4AiTester):
    """Crawl one page asynchronously and require non-empty markdown."""
    print("\n=== Testing Basic Crawl ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
        "session_id": "test"
    }

    result = tester.submit_and_wait(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]
    assert len(result["result"]["markdown"]) > 0


def test_basic_crawl_sync(tester: Crawl4AiTester):
    """Crawl one page through the synchronous endpoint."""
    print("\n=== Testing Basic Crawl (Sync) ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
        "session_id": "test"
    }

    result = tester.submit_sync(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
    assert result['status'] == 'completed'
    assert result['result']['success']
    assert len(result['result']['markdown']) > 0


def test_js_execution(tester: Crawl4AiTester):
    """Crawl with a JS snippet that clicks a 'Load More' button if present."""
    print("\n=== Testing JS Execution ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "js_code": [
            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        ],
        "wait_for": "article.tease-card:nth-child(10)",
        "crawler_params": {
            "headless": True
        }
    }

    result = tester.submit_and_wait(request)
    print(f"JS execution result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]


def test_css_selector(tester: Crawl4AiTester):
    """Crawl with a ``css_selector`` restriction in the request."""
    print("\n=== Testing CSS Selector ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 7,
        "css_selector": ".wide-tease-item__description",
        "crawler_params": {
            "headless": True
        },
        "extra": {"word_count_threshold": 10}

    }

    result = tester.submit_and_wait(request)
    print(f"CSS selector result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]


def test_structured_extraction(tester: Crawl4AiTester):
    """Extract rows from a page with a ``json_css`` schema."""
    print("\n=== Testing Structured Extraction ===")
    schema = {
        "name": "Coinbase Crypto Prices",
        "baseSelector": ".cds-tableRow-t45thuk",
        "fields": [
            {
                "name": "crypto",
                "selector": "td:nth-child(1) h2",
                "type": "text",
            },
            {
                "name": "symbol",
                "selector": "td:nth-child(1) p",
                "type": "text",
            },
            {
                "name": "price",
                "selector": "td:nth-child(2)",
                "type": "text",
            }
        ],
    }

    request = {
        "urls": "https://www.coinbase.com/explore",
        "priority": 9,
        "extraction_config": {
            "type": "json_css",
            "params": {
                "schema": schema
            }
        }
    }

    result = tester.submit_and_wait(request)
    extracted = json.loads(result["result"]["extracted_content"])
    print(f"Extracted {len(extracted)} items")
    # Assert before indexing so an empty extraction fails with a clear
    # assertion rather than an IndexError on extracted[0].
    assert result["result"]["success"]
    assert len(extracted) > 0
    print("Sample item:", json.dumps(extracted[0], indent=2))


def test_llm_extraction(tester: Crawl4AiTester):
    """Extract model pricing with an LLM; failures are reported, not raised."""
    print("\n=== Testing LLM Extraction ===")
    schema = {
        "type": "object",
        "properties": {
            "model_name": {
                "type": "string",
                "description": "Name of the OpenAI model."
            },
            "input_fee": {
                "type": "string",
                "description": "Fee for input token for the OpenAI model."
            },
            "output_fee": {
                "type": "string",
                "description": "Fee for output token for the OpenAI model."
            }
        },
        "required": ["model_name", "input_fee", "output_fee"]
    }

    request = {
        "urls": "https://openai.com/api/pricing",
        "priority": 8,
        "extraction_config": {
            "type": "llm",
            "params": {
                "provider": "openai/gpt-4o-mini",
                "api_token": os.getenv("OPENAI_API_KEY"),
                "schema": schema,
                "extraction_type": "schema",
                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
            }
        },
        "crawler_params": {"word_count_threshold": 1}
    }

    # Best-effort: this test depends on an external API key, so any failure
    # (including a failed assertion) is printed instead of propagated.
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
        print(f"Extracted {len(extracted)} model pricing entries")
        print("Sample entry:", json.dumps(extracted[0], indent=2))
        assert result["result"]["success"]
    except Exception as e:
        print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")


def test_llm_with_ollama(tester: Crawl4AiTester):
    """Extract article info via an Ollama provider; failures are printed."""
    print("\n=== Testing LLM with Ollama ===")
    schema = {
        "type": "object",
        "properties": {
            "article_title": {
                "type": "string",
                "description": "The main title of the news article"
            },
            "summary": {
                "type": "string",
                "description": "A brief summary of the article content"
            },
            "main_topics": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Main topics or themes discussed in the article"
            }
        }
    }

    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "extraction_config": {
            "type": "llm",
            "params": {
                "provider": "ollama/llama2",
                "schema": schema,
                "extraction_type": "schema",
                "instruction": "Extract the main article information including title, summary, and main topics."
            }
        },
        "extra": {"word_count_threshold": 1},
        "crawler_params": {"verbose": True}
    }

    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
        print("Extracted content:", json.dumps(extracted, indent=2))
        assert result["result"]["success"]
    except Exception as e:
        print(f"Ollama extraction test failed: {str(e)}")


def test_cosine_extraction(tester: Crawl4AiTester):
    """Run the ``cosine`` extraction type; failures are printed."""
    print("\n=== Testing Cosine Extraction ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
        "extraction_config": {
            "type": "cosine",
            "params": {
                "semantic_filter": "business finance economy",
                "word_count_threshold": 10,
                "max_dist": 0.2,
                "top_k": 3
            }
        }
    }

    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
        print(f"Extracted {len(extracted)} text clusters")
        print("First cluster tags:", extracted[0]["tags"])
        assert result["result"]["success"]
    except Exception as e:
        print(f"Cosine extraction test failed: {str(e)}")


def test_screenshot(tester: Crawl4AiTester):
    """Request a screenshot and, if one is returned, save it to disk."""
    print("\n=== Testing Screenshot ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 5,
        "screenshot": True,
        "crawler_params": {
            "headless": True
        }
    }

    result = tester.submit_and_wait(request)
    print("Screenshot captured:", bool(result["result"]["screenshot"]))

    if result["result"]["screenshot"]:
        # Save screenshot (the payload is base64-encoded)
        screenshot_data = base64.b64decode(result["result"]["screenshot"])
        with open("test_screenshot.jpg", "wb") as f:
            f.write(screenshot_data)
        print("Screenshot saved as test_screenshot.jpg")

    assert result["result"]["success"]


if __name__ == "__main__":
    version = sys.argv[1] if len(sys.argv) > 1 else "basic"
    # version = "full"
    test_docker_deployment(version)