chore: Update Dockerfile to install chromium-chromedriver and spacy library
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -172,3 +172,5 @@ Crawl4AI.egg-info/
|
|||||||
|
|
||||||
requirements0.txt
|
requirements0.txt
|
||||||
a.txt
|
a.txt
|
||||||
|
|
||||||
|
*.sh
|
||||||
17
Dockerfile
17
Dockerfile
@@ -7,9 +7,6 @@ WORKDIR /usr/src/app
|
|||||||
# Copy the current directory contents into the container at /usr/src/app
|
# Copy the current directory contents into the container at /usr/src/app
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
# Install any needed packages specified in requirements.txt
|
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
|
||||||
|
|
||||||
# Install dependencies for Chrome and ChromeDriver
|
# Install dependencies for Chrome and ChromeDriver
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
wget \
|
wget \
|
||||||
@@ -20,15 +17,17 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
ca-certificates \
|
ca-certificates \
|
||||||
apt-transport-https \
|
apt-transport-https \
|
||||||
software-properties-common \
|
software-properties-common \
|
||||||
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
&& mkdir -p /etc/apt/keyrings \
|
||||||
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
|
&& curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \
|
||||||
|
&& echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \
|
||||||
&& apt-get update \
|
&& apt-get update \
|
||||||
&& apt-get install -y google-chrome-stable \
|
&& apt-get install -y google-chrome-stable \
|
||||||
&& rm -rf /var/lib/apt/lists/* \
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
&& apt install chromium-chromedriver -y
|
&& apt-get install -y chromium-chromedriver
|
||||||
|
|
||||||
# Install spacy library using pip
|
# Install Python dependencies
|
||||||
RUN pip install spacy
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
RUN pip install spacy torch torchvision torchaudio
|
||||||
|
|
||||||
# Set display port and dbus env to avoid hanging
|
# Set display port and dbus env to avoid hanging
|
||||||
ENV DISPLAY=:99
|
ENV DISPLAY=:99
|
||||||
@@ -41,4 +40,4 @@ EXPOSE 80
|
|||||||
ENV PYTHONUNBUFFERED 1
|
ENV PYTHONUNBUFFERED 1
|
||||||
|
|
||||||
# Run uvicorn
|
# Run uvicorn
|
||||||
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||||
|
|||||||
44
Dockerfile_mac
Normal file
44
Dockerfile_mac
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# Use an official Python runtime as a parent image
|
||||||
|
FROM python:3.10-slim
|
||||||
|
|
||||||
|
# Set the working directory in the container
|
||||||
|
WORKDIR /usr/src/app
|
||||||
|
|
||||||
|
# Copy the current directory contents into the container at /usr/src/app
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Install any needed packages specified in requirements.txt
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Install dependencies for Chrome and ChromeDriver
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
wget \
|
||||||
|
xvfb \
|
||||||
|
unzip \
|
||||||
|
curl \
|
||||||
|
gnupg2 \
|
||||||
|
ca-certificates \
|
||||||
|
apt-transport-https \
|
||||||
|
software-properties-common \
|
||||||
|
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
||||||
|
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
|
||||||
|
&& apt-get update \
|
||||||
|
&& apt-get install -y google-chrome-stable \
|
||||||
|
&& rm -rf /var/lib/apt/lists/* \
|
||||||
|
&& apt install chromium-chromedriver -y
|
||||||
|
|
||||||
|
# Install spacy library using pip
|
||||||
|
RUN pip install spacy
|
||||||
|
|
||||||
|
# Set display port and dbus env to avoid hanging
|
||||||
|
ENV DISPLAY=:99
|
||||||
|
ENV DBUS_SESSION_BUS_ADDRESS=/dev/null
|
||||||
|
|
||||||
|
# Make port 80 available to the world outside this container
|
||||||
|
EXPOSE 80
|
||||||
|
|
||||||
|
# Define environment variable
|
||||||
|
ENV PYTHONUNBUFFERED 1
|
||||||
|
|
||||||
|
# Run uvicorn
|
||||||
|
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"]
|
||||||
@@ -28,7 +28,9 @@ To show the simplicity take a look at the first example:
|
|||||||
from crawl4ai import WebCrawler
|
from crawl4ai import WebCrawler
|
||||||
|
|
||||||
# Create the WebCrawler instance
|
# Create the WebCrawler instance
|
||||||
crawler = WebCrawler()
|
crawler = WebCrawler()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Run the crawler with keyword filtering and CSS selector
|
# Run the crawler with keyword filtering and CSS selector
|
||||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||||
@@ -156,11 +158,11 @@ pip install -e .[all]
|
|||||||
docker run -d -p 8000:80 crawl4ai
|
docker run -d -p 8000:80 crawl4ai
|
||||||
```
|
```
|
||||||
|
|
||||||
For more information about how to run Crawl4AI as a local server, please refer to the [GitHub repository](https://github.com/unclecode/crawl4ai).
|
|
||||||
|
|
||||||
## Using the Local server or REST API 🌐
|
## Using the Local server or REST API 🌐
|
||||||
|
|
||||||
You can also use Crawl4AI through the REST API. This method allows you to send HTTP requests to the Crawl4AI server and receive structured data in response. The base URL for the API is `https://crawl4ai.com/crawl` [COMING SOON]. If you run the local server, you can use `http://localhost:8000/crawl`. (Port is dependent on your docker configuration)
|
You can also use Crawl4AI through the REST API. This method allows you to send HTTP requests to the Crawl4AI server and receive structured data in response. The base URL for the API is `https://crawl4ai.com/crawl` [Available now, on a CPU server, of course will be faster on GPU]. If you run the local server, you can use `http://localhost:8000/crawl`. (Port is dependent on your docker configuration)
|
||||||
|
|
||||||
### Example Usage
|
### Example Usage
|
||||||
|
|
||||||
|
|||||||
9
main.py
9
main.py
@@ -2,6 +2,8 @@ import os
|
|||||||
import importlib
|
import importlib
|
||||||
import asyncio
|
import asyncio
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
import logging
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
|
||||||
from fastapi import FastAPI, HTTPException, Request
|
from fastapi import FastAPI, HTTPException, Request
|
||||||
from fastapi.responses import HTMLResponse, JSONResponse
|
from fastapi.responses import HTMLResponse, JSONResponse
|
||||||
@@ -77,7 +79,7 @@ async def get_total_url_count():
|
|||||||
# Add endpoint to clear db
|
# Add endpoint to clear db
|
||||||
@app.get("/clear-db")
|
@app.get("/clear-db")
|
||||||
async def clear_database():
|
async def clear_database():
|
||||||
clear_db()
|
# clear_db()
|
||||||
return JSONResponse(content={"message": "Database cleared."})
|
return JSONResponse(content={"message": "Database cleared."})
|
||||||
|
|
||||||
def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
||||||
@@ -86,12 +88,15 @@ def import_strategy(module_name: str, class_name: str, *args, **kwargs):
|
|||||||
strategy_class = getattr(module, class_name)
|
strategy_class = getattr(module, class_name)
|
||||||
return strategy_class(*args, **kwargs)
|
return strategy_class(*args, **kwargs)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
print("ImportError: Module not found.")
|
||||||
raise HTTPException(status_code=400, detail=f"Module {module_name} not found.")
|
raise HTTPException(status_code=400, detail=f"Module {module_name} not found.")
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
|
print("AttributeError: Class not found.")
|
||||||
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
raise HTTPException(status_code=400, detail=f"Class {class_name} not found in {module_name}.")
|
||||||
|
|
||||||
@app.post("/crawl")
|
@app.post("/crawl")
|
||||||
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
||||||
|
logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
|
||||||
global current_requests
|
global current_requests
|
||||||
async with lock:
|
async with lock:
|
||||||
if current_requests >= MAX_CONCURRENT_REQUESTS:
|
if current_requests >= MAX_CONCURRENT_REQUESTS:
|
||||||
@@ -99,10 +104,12 @@ async def crawl_urls(crawl_request: CrawlRequest, request: Request):
|
|||||||
current_requests += 1
|
current_requests += 1
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
logging.debug("[LOG] Loading extraction and chunking strategies...")
|
||||||
extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy, **crawl_request.extraction_strategy_args)
|
extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy, **crawl_request.extraction_strategy_args)
|
||||||
chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy, **crawl_request.chunking_strategy_args)
|
chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy, **crawl_request.chunking_strategy_args)
|
||||||
|
|
||||||
# Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
|
# Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
|
||||||
|
logging.debug("[LOG] Running the WebCrawler...")
|
||||||
with ThreadPoolExecutor() as executor:
|
with ThreadPoolExecutor() as executor:
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
futures = [
|
futures = [
|
||||||
|
|||||||
Reference in New Issue
Block a user