diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5ec79639..309218dc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,64 @@
# Changelog
+## [0.3.746] November 29, 2024
+
+### Major Features
+1. Enhanced Docker Support
+ - Improved GPU support in Docker images.
+ - Dockerfile refactored for better platform-specific installations.
+ - Introduced new Docker commands for different platforms:
+ - `basic-amd64`, `all-amd64`, `gpu-amd64` for AMD64.
+ - `basic-arm64`, `all-arm64`, `gpu-arm64` for ARM64.
+
+### Infrastructure & Documentation
+- Improved user guidance and installation instructions in README.md.
+- Added installation instructions for Playwright setup in README.
+- Updated the examples in `docs/examples/quickstart_async.py` to be more useful and user-friendly.
+- Updated `requirements.txt` with a new `pydantic` dependency.
+- Bumped version number in `crawl4ai/__version__.py` to 0.3.746.
+
+### Breaking Changes
+- Streamlined application structure:
+  - Removed static pages and related code from `main.py`, which may affect existing deployments that rely on static content.
+
+### Development Updates
+- Developed `post_install` method in `crawl4ai/install.py` to streamline post-installation setup tasks.
+- Refined migration processes in `crawl4ai/migrations.py` with enhanced logging for better error visibility.
+- Updated `docker-compose.yml` to support local and hub services for different architectures, enhancing build and deploy capabilities.
+- Refactored example test cases in `docs/examples/docker_example.py` to facilitate comprehensive testing.
+
+### README.md
+Updated README with new Docker commands, setup instructions, and improved installation guidance.
+
+### crawl4ai/install.py
+Added a `post_install` method that automates post-installation setup tasks.
+
+### crawl4ai/migrations.py
+Refined migration processes and improved logging for better error visibility.
+
+### docker-compose.yml
+Refactored docker-compose to define services for different platforms and versions, improving service management.
+
+### requirements.txt
+Added `pydantic` to the requirements file.
+
+### crawl4ai/__version__.py
+Bumped version number to 0.3.746.
+
+### docs/examples/quickstart_async.py
+Enhanced the example scripts by uncommenting example usage in the async quickstart guide.
+
+### main.py
+Streamlined the app structure by removing the static-pages code, improving maintainability.
+
## [0.3.743] November 27, 2024
Enhance features and documentation
diff --git a/Dockerfile b/Dockerfile
index bd71deae..2997590a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,9 @@
# syntax=docker/dockerfile:1.4
-# Build arguments
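+# TARGETPLATFORM and BUILDPLATFORM are populated automatically by Docker BuildKit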
+ARG TARGETPLATFORM
+ARG BUILDPLATFORM
+
+# Other build arguments
ARG PYTHON_VERSION=3.10
# Base stage with system dependencies
@@ -63,13 +66,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/*
# GPU support if enabled and architecture is supported
-RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \
- apt-get update && apt-get install -y --no-install-recommends \
- nvidia-cuda-toolkit \
- && rm -rf /var/lib/apt/lists/* ; \
- else \
- echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \
- fi
+RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
+ apt-get update && apt-get install -y --no-install-recommends \
+ nvidia-cuda-toolkit \
+ && rm -rf /var/lib/apt/lists/* ; \
+    else \
+        echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
+    fi
# Create and set working directory
WORKDIR /app
@@ -120,7 +123,11 @@ RUN pip install --no-cache-dir \
RUN mkdocs build
# Install Playwright and browsers
-RUN playwright install
+RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+ playwright install chromium; \
+ elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ playwright install chromium; \
+ fi
# Expose port
EXPOSE 8000 11235 9222 8080
diff --git a/README.md b/README.md
index c9d92e17..405c1002 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
1. Install Crawl4AI:
```bash
pip install crawl4ai
+crawl4ai-setup # Set up the browser
```
2. Run a simple web crawl:
@@ -140,11 +141,12 @@ For basic web crawling and scraping tasks:
```bash
pip install crawl4ai
+crawl4ai-setup # Set up the browser
```
By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
-👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
+👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` command should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
1. Through the command line:
@@ -218,48 +220,173 @@ Crawl4AI is available as Docker images for easy deployment. You can either pull
---
-### Option 1: Docker Hub (Recommended)
+
+### 🐳 Option 1: Docker Hub (Recommended)
+
+Choose the appropriate image based on your platform and needs:
+
+#### For AMD64 (Regular Linux/Windows):
```bash
-# Pull and run from Docker Hub (choose one):
-docker pull unclecode/crawl4ai:basic # Basic crawling features
-docker pull unclecode/crawl4ai:all # Full installation (ML, LLM support)
-docker pull unclecode/crawl4ai:gpu # GPU-enabled version
+# Basic version (recommended)
+docker pull unclecode/crawl4ai:basic-amd64
+docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64
-# Run the container
-docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version
+# Full ML/LLM support
+docker pull unclecode/crawl4ai:all-amd64
+docker run -p 11235:11235 unclecode/crawl4ai:all-amd64
-# In case you want to set platform to arm64
-docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic
-
-# In case to allocate more shared memory for the container
-docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic
+# With GPU support
+docker pull unclecode/crawl4ai:gpu-amd64
+docker run -p 11235:11235 unclecode/crawl4ai:gpu-amd64
```
----
+#### For ARM64 (M1/M2 Macs, ARM servers):
+```bash
+# Basic version (recommended)
+docker pull unclecode/crawl4ai:basic-arm64
+docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64
-### Option 2: Build from Repository
+# Full ML/LLM support
+docker pull unclecode/crawl4ai:all-arm64
+docker run -p 11235:11235 unclecode/crawl4ai:all-arm64
+
+# With GPU support
+docker pull unclecode/crawl4ai:gpu-arm64
+docker run -p 11235:11235 unclecode/crawl4ai:gpu-arm64
+```
+
+Need more memory? Add `--shm-size`:
+```bash
+docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-amd64
+```
+
+Test the installation:
+```bash
+curl http://localhost:11235/health
+```
+
+#### For Raspberry Pi (32-bit) (coming soon):
+```bash
+# Pull and run basic version (recommended for Raspberry Pi)
+docker pull unclecode/crawl4ai:basic-armv7
+docker run -p 11235:11235 unclecode/crawl4ai:basic-armv7
+
+# With increased shared memory if needed
+docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-armv7
+```
+
+Note: Due to hardware constraints, only the basic version is recommended for Raspberry Pi.
+
+
+### 🐳 Option 2: Build from Repository
+
+Build the image locally based on your platform:
```bash
# Clone the repository
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
-# Build the image
-docker build -t crawl4ai:local \
- --build-arg INSTALL_TYPE=basic \ # Options: basic, all
+# For AMD64 (Regular Linux/Windows)
+docker build --platform linux/amd64 \
+ --tag crawl4ai:local \
+ --build-arg INSTALL_TYPE=basic \
.
-# In case you want to set platform to arm64
-docker build -t crawl4ai:local \
- --build-arg INSTALL_TYPE=basic \ # Options: basic, all
- --platform linux/arm64 \
+# For ARM64 (M1/M2 Macs, ARM servers)
+docker build --platform linux/arm64 \
+ --tag crawl4ai:local \
+ --build-arg INSTALL_TYPE=basic \
.
-
-# Run your local build
-docker run -p 11235:11235 crawl4ai:local
```
+Build options:
+- INSTALL_TYPE=basic (default): Basic crawling features
+- INSTALL_TYPE=all: Full ML/LLM support
+- ENABLE_GPU=true: Add GPU support (the CUDA toolkit is installed on linux/amd64 builds only)
+
+Example with all options:
+```bash
+docker build --platform linux/amd64 \
+ --tag crawl4ai:local \
+ --build-arg INSTALL_TYPE=all \
+ --build-arg ENABLE_GPU=true \
+ .
+```
+
+Run your local build:
+```bash
+# Regular run
+docker run -p 11235:11235 crawl4ai:local
+
+# With increased shared memory
+docker run --shm-size=2gb -p 11235:11235 crawl4ai:local
+```
+
+Test the installation:
+```bash
+curl http://localhost:11235/health
+```
+
+
+### 🐳 Option 3: Using Docker Compose
+
+Docker Compose provides a more structured way to run Crawl4AI, especially when dealing with environment variables and multiple configurations.
+
+```bash
+# Clone the repository
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+```
+
+#### For AMD64 (Regular Linux/Windows):
+```bash
+# Build and run locally
+docker-compose --profile local-amd64 up
+
+# Run from Docker Hub
+VERSION=basic docker-compose --profile hub-amd64 up # Basic version
+VERSION=all docker-compose --profile hub-amd64 up # Full ML/LLM support
+VERSION=gpu docker-compose --profile hub-amd64 up # GPU support
+```
+
+#### For ARM64 (M1/M2 Macs, ARM servers):
+```bash
+# Build and run locally
+docker-compose --profile local-arm64 up
+
+# Run from Docker Hub
+VERSION=basic docker-compose --profile hub-arm64 up # Basic version
+VERSION=all docker-compose --profile hub-arm64 up # Full ML/LLM support
+VERSION=gpu docker-compose --profile hub-arm64 up # GPU support
+```
+
+Environment variables (optional):
+```bash
+# Create a .env file
+CRAWL4AI_API_TOKEN=your_token
+OPENAI_API_KEY=your_openai_key
+CLAUDE_API_KEY=your_claude_key
+```
+
+The compose file includes:
+- Memory management (4GB limit, 1GB reserved)
+- Shared memory volume for browser support
+- Health checks
+- Auto-restart policy
+- All necessary port mappings
+
+Test the installation:
+```bash
+curl http://localhost:11235/health
+```
+
---
### Quick Test
@@ -276,11 +403,11 @@ response = requests.post(
)
task_id = response.json()["task_id"]
-# Get results
+# Poll the task endpoint until status is "completed" (see the sketch below)
result = requests.get(f"http://localhost:11235/task/{task_id}")
```
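+
+The `GET` above fetches the task state once. A minimal polling sketch, continuing the snippet above (and assuming the task JSON carries a top-level `status` field that becomes `"completed"`, as the comment notes):
+
+```python
+import time
+
+while True:
+    result = requests.get(f"http://localhost:11235/task/{task_id}")
+    if result.json().get("status") == "completed":  # assumed response shape
+        break
+    time.sleep(1)  # short pause between polls
+
+print(result.json())
+```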
-For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/).
+For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/).
diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py
index 0ccf13d8..cee7c25b 100644
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode
from .models import CrawlResult
from .__version__ import __version__
-# __version__ = "0.3.73"
__all__ = [
"AsyncWebCrawler",
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index 8b69d491..4a938b75 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
# crawl4ai/_version.py
-__version__ = "0.3.745"
+__version__ = "0.3.746"
diff --git a/crawl4ai/install.py b/crawl4ai/install.py
new file mode 100644
index 00000000..71fe30ea
--- /dev/null
+++ b/crawl4ai/install.py
@@ -0,0 +1,44 @@
+import subprocess
+import sys
+import asyncio
+from .async_logger import AsyncLogger, LogLevel
+
+# Initialize logger
+logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
+
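+# Exposed as the `crawl4ai-setup` console script (see setup.py entry_points)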
+def post_install():
+ """Run all post-installation tasks"""
+ logger.info("Running post-installation setup...", tag="INIT")
+ install_playwright()
+ run_migration()
+ logger.success("Post-installation setup completed!", tag="COMPLETE")
+
+def install_playwright():
+ logger.info("Installing Playwright browsers...", tag="INIT")
+ try:
+ subprocess.check_call([sys.executable, "-m", "playwright", "install"])
+ logger.success("Playwright installation completed successfully.", tag="COMPLETE")
+ except subprocess.CalledProcessError as e:
+ logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
+ logger.warning(
+ "Please run 'python -m playwright install' manually after the installation."
+ )
+ except Exception as e:
+ logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
+ logger.warning(
+ "Please run 'python -m playwright install' manually after the installation."
+ )
+
+def run_migration():
+ """Initialize database during installation"""
+ try:
+ logger.info("Starting database initialization...", tag="INIT")
+ from crawl4ai.async_database import async_db_manager
+
+ asyncio.run(async_db_manager.initialize())
+ logger.success("Database initialization completed successfully.", tag="COMPLETE")
+ except ImportError:
+ logger.warning("Database module not found. Will initialize on first use.")
+ except Exception as e:
+ logger.warning(f"Database initialization failed: {e}")
+ logger.warning("Database will be initialized on first use")
\ No newline at end of file
diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py
index 77616086..3386b0fb 100644
--- a/crawl4ai/migrations.py
+++ b/crawl4ai/migrations.py
@@ -9,9 +9,13 @@ import aiofiles
import shutil
import time
from datetime import datetime
+from .async_logger import AsyncLogger, LogLevel
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+# Initialize logger
+logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
+
class DatabaseMigration:
def __init__(self, db_path: str):
@@ -55,7 +59,8 @@ class DatabaseMigration:
async def migrate_database(self):
"""Migrate existing database to file-based storage"""
- logger.info("Starting database migration...")
+ logger.info("Starting database migration...", tag="INIT")
try:
async with aiosqlite.connect(self.db_path) as db:
@@ -91,19 +96,25 @@ class DatabaseMigration:
migrated_count += 1
if migrated_count % 100 == 0:
- logger.info(f"Migrated {migrated_count} records...")
+ logger.info(f"Migrated {migrated_count} records...", tag="INIT")
+
await db.commit()
- logger.info(f"Migration completed. {migrated_count} records processed.")
+ logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")
except Exception as e:
- logger.error(f"Migration failed: {e}")
- raise
+            logger.error(
+                message="Migration failed: {error}",
+                tag="ERROR",
+                params={"error": str(e)}
+            )
+            raise
async def backup_database(db_path: str) -> str:
"""Create backup of existing database"""
if not os.path.exists(db_path):
- logger.info("No existing database found. Skipping backup.")
+ logger.info("No existing database found. Skipping backup.", tag="INIT")
return None
# Create backup with timestamp
@@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str:
# Create backup
shutil.copy2(db_path, backup_path)
- logger.info(f"Database backup created at: {backup_path}")
+ logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
return backup_path
except Exception as e:
- logger.error(f"Backup failed: {e}")
- raise
+        logger.error(
+            message="Backup failed: {error}",
+            tag="ERROR",
+            params={"error": str(e)}
+        )
+        raise
async def run_migration(db_path: Optional[str] = None):
"""Run database migration"""
@@ -128,7 +144,7 @@ async def run_migration(db_path: Optional[str] = None):
db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
if not os.path.exists(db_path):
- logger.info("No existing database found. Skipping migration.")
+ logger.info("No existing database found. Skipping migration.", tag="INIT")
return
# Create backup first
diff --git a/docker-compose.yml b/docker-compose.yml
index b93beda9..4b22fd98 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,6 @@
services:
- crawl4ai:
+ # Local build services for different platforms
+ crawl4ai-amd64:
build:
context: .
dockerfile: Dockerfile
@@ -7,35 +8,39 @@ services:
PYTHON_VERSION: "3.10"
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
ENABLE_GPU: false
- profiles: ["local"]
- ports:
- - "11235:11235"
- - "8000:8000"
- - "9222:9222"
- - "8080:8080"
- environment:
- - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
- - OPENAI_API_KEY=${OPENAI_API_KEY:-}
- - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
- volumes:
- - /dev/shm:/dev/shm
- deploy:
- resources:
- limits:
- memory: 4G
- reservations:
- memory: 1G
- restart: unless-stopped
- healthcheck:
- test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
- interval: 30s
- timeout: 10s
- retries: 3
- start_period: 40s
+ platforms:
+ - linux/amd64
+ profiles: ["local-amd64"]
+ extends: &base-config
+ file: docker-compose.yml
+ service: base-config
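+      # the &base-config anchor lets the services below reuse this extends block via *base-config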
- crawl4ai-hub:
- image: unclecode/crawl4ai:basic
- profiles: ["hub"]
+ crawl4ai-arm64:
+ build:
+ context: .
+ dockerfile: Dockerfile
+ args:
+ PYTHON_VERSION: "3.10"
+ INSTALL_TYPE: ${INSTALL_TYPE:-basic}
+ ENABLE_GPU: false
+ platforms:
+ - linux/arm64
+ profiles: ["local-arm64"]
+ extends: *base-config
+
+ # Hub services for different platforms and versions
+ crawl4ai-hub-amd64:
+ image: unclecode/crawl4ai:${VERSION:-basic}-amd64
+ profiles: ["hub-amd64"]
+ extends: *base-config
+
+ crawl4ai-hub-arm64:
+ image: unclecode/crawl4ai:${VERSION:-basic}-arm64
+ profiles: ["hub-arm64"]
+ extends: *base-config
+
+ # Base configuration to be extended
+  base-config:
+    profiles: ["base"]  # placeholder profile so this template service is never started directly
ports:
- "11235:11235"
- "8000:8000"
@@ -59,4 +64,4 @@ services:
interval: 30s
timeout: 10s
retries: 3
- start_period: 40s
+ start_period: 40s
\ No newline at end of file
diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py
index 17ef9f04..48acc809 100644
--- a/docs/examples/docker_example.py
+++ b/docs/examples/docker_example.py
@@ -78,20 +78,20 @@ def test_docker_deployment(version="basic"):
time.sleep(5)
# Test cases based on version
- # test_basic_crawl(tester)
- # test_basic_crawl(tester)
- # test_basic_crawl_sync(tester)
test_basic_crawl_direct(tester)
+ test_basic_crawl(tester)
+ test_basic_crawl(tester)
+ test_basic_crawl_sync(tester)
- # if version in ["full", "transformer"]:
- # test_cosine_extraction(tester)
+ if version in ["full", "transformer"]:
+ test_cosine_extraction(tester)
- # test_js_execution(tester)
- # test_css_selector(tester)
- # test_structured_extraction(tester)
- # test_llm_extraction(tester)
- # test_llm_with_ollama(tester)
- # test_screenshot(tester)
+ test_js_execution(tester)
+ test_css_selector(tester)
+ test_structured_extraction(tester)
+ test_llm_extraction(tester)
+ test_llm_with_ollama(tester)
+ test_screenshot(tester)
def test_basic_crawl(tester: Crawl4AiTester):
diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py
index 9f1eff53..679a9bc2 100644
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -32,7 +32,7 @@ print("Website: https://crawl4ai.com")
async def simple_crawl():
print("\n--- Basic Usage ---")
async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(url="https://www.nbcnews.com/business")
+ result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS)
print(result.markdown[:500]) # Print first 500 characters
async def simple_example_with_running_js_code():
@@ -76,16 +76,17 @@ async def use_proxy():
async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
result = await crawler.arun(
url="https://www.nbcnews.com/business",
- bypass_cache=True
+            cache_mode=CacheMode.BYPASS
)
- print(result.markdown[:500]) # Print first 500 characters
+ if result.success:
+ print(result.markdown[:500]) # Print first 500 characters
async def capture_and_save_screenshot(url: str, output_path: str):
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url=url,
screenshot=True,
- bypass_cache=True
+            cache_mode=CacheMode.BYPASS
)
if result.success and result.screenshot:
@@ -141,41 +142,68 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
schema = {
- "name": "Coinbase Crypto Prices",
- "baseSelector": ".cds-tableRow-t45thuk",
- "fields": [
- {
- "name": "crypto",
- "selector": "td:nth-child(1) h2",
- "type": "text",
- },
- {
- "name": "symbol",
- "selector": "td:nth-child(1) p",
- "type": "text",
- },
- {
- "name": "price",
- "selector": "td:nth-child(2)",
- "type": "text",
+ "name": "KidoCode Courses",
+ "baseSelector": "section.charge-methodology .w-tab-content > div",
+ "fields": [
+ {
+ "name": "section_title",
+ "selector": "h3.heading-50",
+ "type": "text",
+ },
+ {
+ "name": "section_description",
+ "selector": ".charge-content",
+ "type": "text",
+ },
+ {
+ "name": "course_name",
+ "selector": ".text-block-93",
+ "type": "text",
+ },
+ {
+ "name": "course_description",
+ "selector": ".course-content-text",
+ "type": "text",
+ },
+ {
+ "name": "course_icon",
+ "selector": ".image-92",
+ "type": "attribute",
+ "attribute": "src"
+ }
+ ]
+    }
+
+ async with AsyncWebCrawler(
+ headless=True,
+ verbose=True
+ ) as crawler:
+
+ # Create the JavaScript that handles clicking multiple times
+ js_click_tabs = """
+ (async () => {
+ const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+
+ for(let tab of tabs) {
+ // scroll to the tab
+ tab.scrollIntoView();
+ tab.click();
+ // Wait for content to load and animations to complete
+ await new Promise(r => setTimeout(r, 500));
}
- ],
- }
+ })();
+ """
- extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
-
- async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
- url="https://www.coinbase.com/explore",
- extraction_strategy=extraction_strategy,
- cache_mode=CacheMode.BYPASS,
+ url="https://www.kidocode.com/degrees/technology",
+ extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
+ js_code=[js_click_tabs],
+ cache_mode=CacheMode.BYPASS
)
- assert result.success, "Failed to crawl the page"
-
- news_teasers = json.loads(result.extracted_content)
- print(f"Successfully extracted {len(news_teasers)} news teasers")
- print(json.dumps(news_teasers[0], indent=2))
+    courses = json.loads(result.extracted_content)
+    print(f"Successfully extracted {len(courses)} courses")
+    print(json.dumps(courses[0], indent=2))
# Advanced Session-Based Crawling with Dynamic Content 🔄
async def crawl_dynamic_content_pages_method_1():
@@ -363,21 +391,21 @@ async def crawl_custom_browser_type():
# Use Firefox
start = time.time()
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
- result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+ result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
print(result.markdown[:500])
print("Time taken: ", time.time() - start)
# Use WebKit
start = time.time()
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
- result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+ result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
print(result.markdown[:500])
print("Time taken: ", time.time() - start)
# Use Chromium (default)
start = time.time()
async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
- result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+ result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
print(result.markdown[:500])
print("Time taken: ", time.time() - start)
@@ -537,7 +565,7 @@ async def main():
await simple_crawl()
await simple_example_with_running_js_code()
await simple_example_with_css_selector()
- await use_proxy()
+ # await use_proxy()
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
await extract_structured_data_using_css_extractor()
@@ -548,14 +576,14 @@ async def main():
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# You always can pass custom headers to the extraction strategy
- custom_headers = {
- "Authorization": "Bearer your-custom-token",
- "X-Custom-Header": "Some-Value"
- }
- await extract_structured_data_using_llm(extra_headers=custom_headers)
+ # custom_headers = {
+ # "Authorization": "Bearer your-custom-token",
+ # "X-Custom-Header": "Some-Value"
+ # }
+ # await extract_structured_data_using_llm(extra_headers=custom_headers)
- # await crawl_dynamic_content_pages_method_1()
- # await crawl_dynamic_content_pages_method_2()
+ await crawl_dynamic_content_pages_method_1()
+ await crawl_dynamic_content_pages_method_2()
await crawl_dynamic_content_pages_method_3()
await crawl_custom_browser_type()
diff --git a/main.py b/main.py
index 6d217410..d6c792e8 100644
--- a/main.py
+++ b/main.py
@@ -340,9 +340,6 @@ app.add_middleware(
allow_headers=["*"], # Allows all headers
)
-# Mount the pages directory as a static directory
-app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")
-
# API token security
security = HTTPBearer()
CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
@@ -364,7 +361,6 @@ if os.path.exists(__location__ + "/site"):
app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs")
site_templates = Jinja2Templates(directory=__location__ + "/site")
-templates = Jinja2Templates(directory=__location__ + "/pages")
crawler_service = CrawlerService()
diff --git a/requirements.txt b/requirements.txt
index ed259ac9..741e12ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +1,16 @@
aiosqlite~=0.20
-html2text~=2024.2
lxml~=5.3
-litellm~=1.48
+litellm>=1.53.1
numpy>=1.26.0,<3
pillow~=10.4
-playwright>=1.47,<1.48
+playwright>=1.49.0
python-dotenv~=1.0
requests~=2.26
beautifulsoup4~=4.12
-tf-playwright-stealth~=1.0
+tf-playwright-stealth>=1.1.0
xxhash~=3.4
rank-bm25~=0.2
-aiofiles~=24.0
+aiofiles>=24.1.0
colorama~=0.4
-snowballstemmer~=2.2
\ No newline at end of file
+snowballstemmer~=2.2
+pydantic>=2.10
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d44169bf..e6840cd0 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,8 @@
from setuptools import setup, find_packages
-from setuptools.command.install import install
import os
from pathlib import Path
import shutil
-import subprocess
-import sys
-import asyncio
+
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
# If the folder already exists, remove the cache folder
@@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"]
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
sync_requirements = ["selenium"]
-
-def install_playwright():
- print("Installing Playwright browsers...")
- try:
- subprocess.check_call([sys.executable, "-m", "playwright", "install"])
- print("Playwright installation completed successfully.")
- except subprocess.CalledProcessError as e:
- print(f"Error during Playwright installation: {e}")
- print(
- "Please run 'python -m playwright install' manually after the installation."
- )
- except Exception as e:
- print(f"Unexpected error during Playwright installation: {e}")
- print(
- "Please run 'python -m playwright install' manually after the installation."
- )
-
-
-def run_migration():
- """Initialize database during installation"""
- try:
- print("Starting database initialization...")
- from crawl4ai.async_database import async_db_manager
-
- asyncio.run(async_db_manager.initialize())
- print("Database initialization completed successfully.")
- except ImportError:
- print("Warning: Database module not found. Will initialize on first use.")
- except Exception as e:
- print(f"Warning: Database initialization failed: {e}")
- print("Database will be initialized on first use")
-
-
-class PostInstallCommand(install):
- def run(self):
- install.run(self)
- install_playwright()
- # run_migration()
-
-
setup(
name="Crawl4AI",
version=version,
@@ -116,7 +73,8 @@ setup(
entry_points={
"console_scripts": [
"crawl4ai-download-models=crawl4ai.model_loader:main",
- "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command
+ "crawl4ai-migrate=crawl4ai.migrations:main",
+            "crawl4ai-setup=crawl4ai.install:post_install",
],
},
classifiers=[
@@ -130,7 +88,4 @@ setup(
"Programming Language :: Python :: 3.10",
],
python_requires=">=3.7",
- cmdclass={
- "install": PostInstallCommand,
- },
)