diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ec79639..309218dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,64 @@ # Changelog +## [0.3.746] November 29, 2024 + +### Major Features +1. Enhanced Docker Support (Nov 29, 2024) + - Improved GPU support in Docker images. + - Dockerfile refactored for better platform-specific installations. + - Introduced new Docker commands for different platforms: + - `basic-amd64`, `all-amd64`, `gpu-amd64` for AMD64. + - `basic-arm64`, `all-arm64`, `gpu-arm64` for ARM64. + +### Infrastructure & Documentation +- Enhanced README.md to improve user guidance and installation instructions. +- Added installation instructions for Playwright setup in README. +- Created and updated examples in `docs/examples/quickstart_async.py` to be more useful and user-friendly. +- Updated `requirements.txt` with a new `pydantic` dependency. +- Bumped version number in `crawl4ai/__version__.py` to 0.3.746. + +### Breaking Changes +- Streamlined application structure: + - Removed static pages and related code from `main.py` which might affect existing deployments relying on static content. + +### Development Updates +- Developed `post_install` method in `crawl4ai/install.py` to streamline post-installation setup tasks. +- Refined migration processes in `crawl4ai/migrations.py` with enhanced logging for better error visibility. +- Updated `docker-compose.yml` to support local and hub services for different architectures, enhancing build and deploy capabilities. +- Refactored example test cases in `docs/examples/docker_example.py` to facilitate comprehensive testing. + +### README.md +Updated README with new docker commands and setup instructions. +Enhanced installation instructions and guidance. + +### crawl4ai/install.py +Added post-install script functionality. +Introduced `post_install` method for automation of post-installation tasks. + +### crawl4ai/migrations.py +Improved migration logging. 
+Refined migration processes and added better logging. + +### docker-compose.yml +Refactored docker-compose for better service management. +Updated to define services for different platforms and versions. + +### requirements.txt +Updated dependencies. +Added `pydantic` to requirements file. + +### crawl4ai/__version__.py +Updated version number. +Bumped version number to 0.3.746. + +### docs/examples/quickstart_async.py +Enhanced example scripts. +Uncommented example usage in async guide for user functionality. + +### main.py +Refactored code to improve maintainability. +Streamlined app structure by removing static pages code. + ## [0.3.743] November 27, 2024 Enhance features and documentation diff --git a/Dockerfile b/Dockerfile index bd71deae..2997590a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,9 @@ # syntax=docker/dockerfile:1.4 -# Build arguments +ARG TARGETPLATFORM +ARG BUILDPLATFORM + +# Other build arguments ARG PYTHON_VERSION=3.10 # Base stage with system dependencies @@ -63,13 +66,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # GPU support if enabled and architecture is supported -RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \ - apt-get update && apt-get install -y --no-install-recommends \ - nvidia-cuda-toolkit \ - && rm -rf /var/lib/apt/lists/* ; \ - else \ - echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \ - fi +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ +else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ +fi # Create and set working directory WORKDIR /app @@ -120,7 +123,11 @@ RUN pip install --no-cache-dir \ RUN mkdocs build # Install Playwright and browsers -RUN playwright install +RUN 
if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ + playwright install chromium; \ + elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + playwright install chromium; \ + fi # Expose port EXPOSE 8000 11235 9222 8080 diff --git a/README.md b/README.md index c9d92e17..405c1002 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant 1. Install Crawl4AI: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` 2. Run a simple web crawl: @@ -140,11 +141,12 @@ For basic web crawling and scraping tasks: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. -👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: +👉 **Note**: After installing Crawl4AI, running the `crawl4ai-setup` command should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: 1. Through the command line: @@ -218,48 +220,173 @@ Crawl4AI is available as Docker images for easy deployment. You can either pull --- -### Option 1: Docker Hub (Recommended) +
+🐳 Option 1: Docker Hub (Recommended) +Choose the appropriate image based on your platform and needs: + +### For AMD64 (Regular Linux/Windows): ```bash -# Pull and run from Docker Hub (choose one): -docker pull unclecode/crawl4ai:basic # Basic crawling features -docker pull unclecode/crawl4ai:all # Full installation (ML, LLM support) -docker pull unclecode/crawl4ai:gpu # GPU-enabled version +# Basic version (recommended) +docker pull unclecode/crawl4ai:basic-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64 -# Run the container -docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version +# Full ML/LLM support +docker pull unclecode/crawl4ai:all-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:all-amd64 -# In case you want to set platform to arm64 -docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic - -# In case to allocate more shared memory for the container -docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic +# With GPU support +docker pull unclecode/crawl4ai:gpu-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:gpu-amd64 ``` ---- +### For ARM64 (M1/M2 Macs, ARM servers): +```bash +# Basic version (recommended) +docker pull unclecode/crawl4ai:basic-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64 -### Option 2: Build from Repository +# Full ML/LLM support +docker pull unclecode/crawl4ai:all-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:all-arm64 + +# With GPU support +docker pull unclecode/crawl4ai:gpu-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:gpu-arm64 +``` + +Need more memory? 
Add `--shm-size`: +```bash +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-amd64 +``` + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +### For Raspberry Pi (32-bit) (coming soon): +```bash +# Pull and run basic version (recommended for Raspberry Pi) +docker pull unclecode/crawl4ai:basic-armv7 +docker run -p 11235:11235 unclecode/crawl4ai:basic-armv7 + +# With increased shared memory if needed +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-armv7 +``` + +Note: Due to hardware constraints, only the basic version is recommended for Raspberry Pi. + +
+ +
+🐳 Option 2: Build from Repository + +Build the image locally based on your platform: ```bash # Clone the repository git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -# Build the image -docker build -t crawl4ai:local \ - --build-arg INSTALL_TYPE=basic \ # Options: basic, all +# For AMD64 (Regular Linux/Windows) +docker build --platform linux/amd64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ . -# In case you want to set platform to arm64 -docker build -t crawl4ai:local \ - --build-arg INSTALL_TYPE=basic \ # Options: basic, all - --platform linux/arm64 \ +# For ARM64 (M1/M2 Macs, ARM servers) +docker build --platform linux/arm64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ . - -# Run your local build -docker run -p 11235:11235 crawl4ai:local ``` +Build options: +- INSTALL_TYPE=basic (default): Basic crawling features +- INSTALL_TYPE=all: Full ML/LLM support +- ENABLE_GPU=true: Add GPU support + +Example with all options: +```bash +docker build --platform linux/amd64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=true \ + . +``` + +Run your local build: +```bash +# Regular run +docker run -p 11235:11235 crawl4ai:local + +# With increased shared memory +docker run --shm-size=2gb -p 11235:11235 crawl4ai:local +``` + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +
+ +
+🐳 Option 3: Using Docker Compose + +Docker Compose provides a more structured way to run Crawl4AI, especially when dealing with environment variables and multiple configurations. + +```bash +# Clone the repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +``` + +### For AMD64 (Regular Linux/Windows): +```bash +# Build and run locally +docker-compose --profile local-amd64 up + +# Run from Docker Hub +VERSION=basic docker-compose --profile hub-amd64 up # Basic version +VERSION=all docker-compose --profile hub-amd64 up # Full ML/LLM support +VERSION=gpu docker-compose --profile hub-amd64 up # GPU support +``` + +### For ARM64 (M1/M2 Macs, ARM servers): +```bash +# Build and run locally +docker-compose --profile local-arm64 up + +# Run from Docker Hub +VERSION=basic docker-compose --profile hub-arm64 up # Basic version +VERSION=all docker-compose --profile hub-arm64 up # Full ML/LLM support +VERSION=gpu docker-compose --profile hub-arm64 up # GPU support +``` + +Environment variables (optional): +```bash +# Create a .env file +CRAWL4AI_API_TOKEN=your_token +OPENAI_API_KEY=your_openai_key +CLAUDE_API_KEY=your_claude_key +``` + +The compose file includes: +- Memory management (4GB limit, 1GB reserved) +- Shared memory volume for browser support +- Health checks +- Auto-restart policy +- All necessary port mappings + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +
+ --- ### Quick Test @@ -276,11 +403,11 @@ response = requests.post( ) task_id = response.json()["task_id"] -# Get results +# Continue polling until the task is complete (status="completed") result = requests.get(f"http://localhost:11235/task/{task_id}") ``` -For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/). +For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/). diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0ccf13d8..cee7c25b 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode from .models import CrawlResult from .__version__ import __version__ -# __version__ = "0.3.73" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 8b69d491..4a938b75 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.745" +__version__ = "0.3.746" diff --git a/crawl4ai/install.py b/crawl4ai/install.py new file mode 100644 index 00000000..71fe30ea --- /dev/null +++ b/crawl4ai/install.py @@ -0,0 +1,44 @@ +import subprocess +import sys +import asyncio +from .async_logger import AsyncLogger, LogLevel + +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +def post_install(): + """Run all post-installation tasks""" + logger.info("Running post-installation setup...", tag="INIT") + install_playwright() + run_migration() + logger.success("Post-installation setup completed!", tag="COMPLETE") + +def install_playwright(): + logger.info("Installing Playwright browsers...", tag="INIT") + try: + 
subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + logger.success("Playwright installation completed successfully.", tag="COMPLETE") + except subprocess.CalledProcessError as e: + logger.error(f"Error during Playwright installation: {e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + except Exception as e: + logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + +def run_migration(): + """Initialize database during installation""" + try: + logger.info("Starting database initialization...", tag="INIT") + from crawl4ai.async_database import async_db_manager + + asyncio.run(async_db_manager.initialize()) + logger.success("Database initialization completed successfully.", tag="COMPLETE") + except ImportError: + logger.warning("Database module not found. Will initialize on first use.") + except Exception as e: + logger.warning(f"Database initialization failed: {e}") + logger.warning("Database will be initialized on first use") \ No newline at end of file diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py index 77616086..3386b0fb 100644 --- a/crawl4ai/migrations.py +++ b/crawl4ai/migrations.py @@ -9,9 +9,13 @@ import aiofiles import shutil import time from datetime import datetime +from .async_logger import AsyncLogger, LogLevel -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) class DatabaseMigration: def __init__(self, db_path: str): @@ -55,7 +59,8 @@ class DatabaseMigration: async def migrate_database(self): """Migrate existing database to file-based storage""" - logger.info("Starting database migration...") + # logger.info("Starting database 
migration...") + logger.info("Starting database migration...", tag="INIT") try: async with aiosqlite.connect(self.db_path) as db: @@ -91,19 +96,25 @@ class DatabaseMigration: migrated_count += 1 if migrated_count % 100 == 0: - logger.info(f"Migrated {migrated_count} records...") + logger.info(f"Migrated {migrated_count} records...", tag="INIT") + await db.commit() - logger.info(f"Migration completed. {migrated_count} records processed.") + logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE") except Exception as e: - logger.error(f"Migration failed: {e}") - raise + # logger.error(f"Migration failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def backup_database(db_path: str) -> str: """Create backup of existing database""" if not os.path.exists(db_path): - logger.info("No existing database found. Skipping backup.") + logger.info("No existing database found. Skipping backup.", tag="INIT") return None # Create backup with timestamp @@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str: # Create backup shutil.copy2(db_path, backup_path) - logger.info(f"Database backup created at: {backup_path}") + logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE") return backup_path except Exception as e: - logger.error(f"Backup failed: {e}") - raise + # logger.error(f"Backup failed: {e}") + logger.error( + message="Backup failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def run_migration(db_path: Optional[str] = None): """Run database migration""" @@ -128,7 +144,7 @@ async def run_migration(db_path: Optional[str] = None): db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") if not os.path.exists(db_path): - logger.info("No existing database found. 
Skipping migration.", tag="INIT") return # Create backup first diff --git a/docker-compose.yml b/docker-compose.yml index b93beda9..4b22fd98 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,6 @@ services: - crawl4ai: + # Local build services for different platforms + crawl4ai-amd64: build: context: . dockerfile: Dockerfile @@ -7,35 +8,39 @@ services: PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: false - profiles: ["local"] - ports: - - "11235:11235" - - "8000:8000" - - "9222:9222" - - "8080:8080" - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} - - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} - volumes: - - /dev/shm:/dev/shm - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s + platforms: + - linux/amd64 + profiles: ["local-amd64"] + extends: &base-config + file: docker-compose.yml + service: base-config - crawl4ai-hub: - image: unclecode/crawl4ai:basic - profiles: ["hub"] + crawl4ai-arm64: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + INSTALL_TYPE: ${INSTALL_TYPE:-basic} + ENABLE_GPU: false + platforms: + - linux/arm64 + profiles: ["local-arm64"] + extends: *base-config + + # Hub services for different platforms and versions + crawl4ai-hub-amd64: + image: unclecode/crawl4ai:${VERSION:-basic}-amd64 + profiles: ["hub-amd64"] + extends: *base-config + + crawl4ai-hub-arm64: + image: unclecode/crawl4ai:${VERSION:-basic}-arm64 + profiles: ["hub-arm64"] + extends: *base-config + + # Base configuration to be extended + base-config: ports: - "11235:11235" - "8000:8000" @@ -59,4 +64,4 @@ services: interval: 30s timeout: 10s retries: 3 - start_period: 40s + start_period: 40s \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 17ef9f04..48acc809 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -78,20 +78,20 @@ def test_docker_deployment(version="basic"): time.sleep(5) # Test cases based on version - # test_basic_crawl(tester) - # test_basic_crawl(tester) - # test_basic_crawl_sync(tester) test_basic_crawl_direct(tester) + test_basic_crawl(tester) + test_basic_crawl(tester) + test_basic_crawl_sync(tester) - # if version in ["full", "transformer"]: - # test_cosine_extraction(tester) + if version in ["full", "transformer"]: + test_cosine_extraction(tester) - # test_js_execution(tester) - # test_css_selector(tester) - # test_structured_extraction(tester) - # test_llm_extraction(tester) - # test_llm_with_ollama(tester) - # test_screenshot(tester) + test_js_execution(tester) + test_css_selector(tester) + test_structured_extraction(tester) + test_llm_extraction(tester) + test_llm_with_ollama(tester) + test_screenshot(tester) def test_basic_crawl(tester: Crawl4AiTester): diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9f1eff53..679a9bc2 100644 --- a/docs/examples/quickstart_async.py +++ 
b/docs/examples/quickstart_async.py @@ -32,7 +32,7 @@ print("Website: https://crawl4ai.com") async def simple_crawl(): print("\n--- Basic Usage ---") async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://www.nbcnews.com/business") + result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) # Print first 500 characters async def simple_example_with_running_js_code(): @@ -76,16 +76,17 @@ async def use_proxy(): async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", - bypass_cache=True + cache_mode= CacheMode.BYPASS ) - print(result.markdown[:500]) # Print first 500 characters + if result.success: + print(result.markdown[:500]) # Print first 500 characters async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url=url, screenshot=True, - bypass_cache=True + cache_mode= CacheMode.BYPASS ) if result.success and result.screenshot: @@ -141,41 +142,68 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") schema = { - "name": "Coinbase Crypto Prices", - "baseSelector": ".cds-tableRow-t45thuk", - "fields": [ - { - "name": "crypto", - "selector": "td:nth-child(1) h2", - "type": "text", - }, - { - "name": "symbol", - "selector": "td:nth-child(1) p", - "type": "text", - }, - { - "name": "price", - "selector": "td:nth-child(2)", - "type": "text", + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + 
"type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src" + } + ] +} + + async with AsyncWebCrawler( + headless=True, + verbose=True + ) as crawler: + + # Create the JavaScript that handles clicking multiple times + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + + for(let tab of tabs) { + // scroll to the tab + tab.scrollIntoView(); + tab.click(); + // Wait for content to load and animations to complete + await new Promise(r => setTimeout(r, 500)); } - ], - } + })(); + """ - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( - url="https://www.coinbase.com/explore", - extraction_strategy=extraction_strategy, - cache_mode=CacheMode.BYPASS, + url="https://www.kidocode.com/degrees/technology", + extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), + js_code=[js_click_tabs], + cache_mode=CacheMode.BYPASS ) - assert result.success, "Failed to crawl the page" - - news_teasers = json.loads(result.extracted_content) - print(f"Successfully extracted {len(news_teasers)} news teasers") - print(json.dumps(news_teasers[0], indent=2)) + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) # Advanced Session-Based Crawling with Dynamic Content 🔄 async def crawl_dynamic_content_pages_method_1(): @@ -363,21 +391,21 @@ async def crawl_custom_browser_type(): # Use Firefox start = time.time() async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", 
bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use WebKit start = time.time() async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use Chromium (default) start = time.time() async with AsyncWebCrawler(verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) @@ -537,7 +565,7 @@ async def main(): await simple_crawl() await simple_example_with_running_js_code() await simple_example_with_css_selector() - await use_proxy() + # await use_proxy() await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) await extract_structured_data_using_css_extractor() @@ -548,14 +576,14 @@ async def main(): await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy - custom_headers = { - "Authorization": "Bearer your-custom-token", - "X-Custom-Header": "Some-Value" - } - await extract_structured_data_using_llm(extra_headers=custom_headers) + # custom_headers = { + # "Authorization": "Bearer your-custom-token", + # "X-Custom-Header": "Some-Value" + # } + # await extract_structured_data_using_llm(extra_headers=custom_headers) - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() + await crawl_dynamic_content_pages_method_1() + await 
crawl_dynamic_content_pages_method_2() await crawl_dynamic_content_pages_method_3() await crawl_custom_browser_type() diff --git a/main.py b/main.py index 6d217410..d6c792e8 100644 --- a/main.py +++ b/main.py @@ -340,9 +340,6 @@ app.add_middleware( allow_headers=["*"], # Allows all headers ) -# Mount the pages directory as a static directory -app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") - # API token security security = HTTPBearer() CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" @@ -364,7 +361,6 @@ if os.path.exists(__location__ + "/site"): app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") site_templates = Jinja2Templates(directory=__location__ + "/site") -templates = Jinja2Templates(directory=__location__ + "/pages") crawler_service = CrawlerService() diff --git a/requirements.txt b/requirements.txt index ed259ac9..741e12ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,16 @@ aiosqlite~=0.20 -html2text~=2024.2 lxml~=5.3 -litellm~=1.48 +litellm>=1.53.1 numpy>=1.26.0,<3 pillow~=10.4 -playwright>=1.47,<1.48 +playwright>=1.49.0 python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 -tf-playwright-stealth~=1.0 +tf-playwright-stealth>=1.1.0 xxhash~=3.4 rank-bm25~=0.2 -aiofiles~=24.0 +aiofiles>=24.1.0 colorama~=0.4 -snowballstemmer~=2.2 \ No newline at end of file +snowballstemmer~=2.2 +pydantic>=2.10 \ No newline at end of file diff --git a/setup.py b/setup.py index d44169bf..e6840cd0 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,8 @@ from setuptools import setup, find_packages -from setuptools.command.install import install import os from pathlib import Path import shutil -import subprocess -import sys -import asyncio + # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder @@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"] cosine_similarity_requirements = 
["torch", "transformers", "nltk"] sync_requirements = ["selenium"] - -def install_playwright(): - print("Installing Playwright browsers...") - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - print("Playwright installation completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - except Exception as e: - print(f"Unexpected error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - - -def run_migration(): - """Initialize database during installation""" - try: - print("Starting database initialization...") - from crawl4ai.async_database import async_db_manager - - asyncio.run(async_db_manager.initialize()) - print("Database initialization completed successfully.") - except ImportError: - print("Warning: Database module not found. Will initialize on first use.") - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_playwright() - # run_migration() - - setup( name="Crawl4AI", version=version, @@ -116,7 +73,8 @@ setup( entry_points={ "console_scripts": [ "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command + "crawl4ai-migrate=crawl4ai.migrations:main", + 'crawl4ai-setup=crawl4ai.install:post_install', ], }, classifiers=[ @@ -130,7 +88,4 @@ setup( "Programming Language :: Python :: 3.10", ], python_requires=">=3.7", - cmdclass={ - "install": PostInstallCommand, - }, )