feat(docker): add Docker Compose configurations for local and hub deployment; enhance GPU support checks in Dockerfile

feat(requirements): update requirements.txt to include snowballstemmer
fix(version_manager): correct version parsing to use __version__.__version__
feat(main): introduce chunking strategy and content filter in CrawlRequest model
feat(content_filter): enhance BM25 algorithm with priority tag scoring for improved content relevance
feat(logger): implement new async logger engine replacing print statements throughout library
fix(database): resolve version-related deadlock and circular lock issues in database operations
docs(docker): expand Docker deployment documentation with usage instructions for Docker Compose
UncleCode
2024-11-18 21:00:06 +08:00
parent 152ac35bc2
commit 852729ff38
15 changed files with 952 additions and 232 deletions

View File

@@ -1,9 +1,16 @@
import os, sys
# append the parent directory to the sys.path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
__data__ = os.path.join(__location__, "__data")
import asyncio
import os
from pathlib import Path
import aiohttp
import json
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
# 1. File Download Processing Example
@@ -32,7 +39,8 @@ async def download_example():
console.log('No .exe download link found');
}
""",
wait_for=5 # Wait 5 seconds to ensure download starts
delay_before_return_html=1,  # Wait 1 second before returning HTML so the download can start
cache_mode=CacheMode.BYPASS
)
if result.downloaded_files:
@@ -50,22 +58,32 @@ async def content_filtering_example():
async with AsyncWebCrawler(verbose=True) as crawler:
# Create filter with a custom query for TechCrunch's front page
content_filter = BM25ContentFilter(
user_query="AI language models research innovation",
# user_query="Investment and fundraising",
# user_query="Robotic",
bm25_threshold=1.0
)
result = await crawler.arun(
url="https://openai.com/blog",
content_filter=content_filter
url="https://techcrunch.com/",
content_filter=content_filter,
cache_mode=CacheMode.BYPASS
)
print(f"Filtered content: {result.extracted_content}")
print(f"Filtered content: {len(result.fit_markdown)}")
print(f"Filtered content: {result.fit_markdown}")
# Save html
with open(os.path.join(__data__, "techcrunch.html"), "w") as f:
f.write(result.fit_html)
with open(os.path.join(__data__, "filtered_content.md"), "w") as f:
f.write(result.fit_markdown)
# 3. Local File and Raw HTML Processing Example
async def local_and_raw_html_example():
"""Example of processing local files and raw HTML"""
# Create a sample HTML file
sample_file = "sample.html"
sample_file = os.path.join(__data__, "sample.html")
with open(sample_file, "w") as f:
f.write("""
<html><body>
@@ -112,21 +130,18 @@ async def browser_management_example():
headless=False,
verbose=True
) as crawler:
result = await crawler.arun(
url="https://crawl4ai.com",
# session_id="persistent_session_1",
cache_mode=CacheMode.BYPASS
)
# Use GitHub's trending page as an example - it's a good test for browser
# management because it exercises session reuse and custom JS execution
result = await crawler.arun(
url="https://github.com/trending",
session_id="persistent_session_1",
js_code="""
// Custom JavaScript to execute on GitHub's trending page
const repos = document.querySelectorAll('article.Box-row');
const data = Array.from(repos).map(repo => ({
name: repo.querySelector('h2')?.textContent?.trim(),
description: repo.querySelector('p')?.textContent?.trim(),
language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim()
}));
console.log('Trending repositories:', JSON.stringify(data, null, 2));
"""
# session_id="persistent_session_1",
cache_mode=CacheMode.BYPASS
)
print("\nBrowser session result:", result.success)
@@ -136,6 +151,8 @@ async def browser_management_example():
# 5. API Usage Example
async def api_example():
"""Example of using the new API endpoints"""
api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
headers = {'Authorization': f'Bearer {api_token}'}
async with aiohttp.ClientSession() as session:
# Submit crawl job
crawl_request = {
@@ -143,52 +160,78 @@ async def api_example():
"extraction_config": {
"type": "json_css",
"params": {
"selectors": {
"titles": ".title a",
"scores": ".score",
"comments": ".comment-tree"
"schema": {
"name": "Hacker News Articles",
"baseSelector": ".athing",
"fields": [
{
"name": "title",
"selector": ".title a",
"type": "text"
},
{
"name": "score",
"selector": ".score",
"type": "text"
},
{
"name": "url",
"selector": ".title a",
"type": "attribute",
"attribute": "href"
}
]
}
}
},
"crawler_params": {
"headless": True,
"use_managed_browser": True
# "use_managed_browser": True
},
"screenshot": True,
"magic": True
"cache_mode": "bypass",
# "screenshot": True,
# "magic": True
}
async with session.post(
"http://localhost:11235/crawl",
json=crawl_request
json=crawl_request,
headers=headers
) as response:
task_data = await response.json()
task_id = task_data["task_id"]
# Check task status
async with session.get(
f"http://localhost:11235/task/{task_id}"
) as status_response:
result = await status_response.json()
print(f"Task result: {result}")
while True:
async with session.get(
f"http://localhost:11235/task/{task_id}",
headers=headers
) as status_response:
result = await status_response.json()
print(f"Task result: {result}")
if result["status"] == "completed":
break
else:
await asyncio.sleep(1)
# Main execution
async def main():
print("Running Crawl4AI feature examples...")
# print("Running Crawl4AI feature examples...")
print("\n1. Running Download Example:")
# print("\n1. Running Download Example:")
await download_example()
print("\n2. Running Content Filtering Example:")
# print("\n2. Running Content Filtering Example:")
await content_filtering_example()
print("\n3. Running Local and Raw HTML Example:")
# print("\n3. Running Local and Raw HTML Example:")
await local_and_raw_html_example()
print("\n4. Running Browser Management Example:")
# print("\n4. Running Browser Management Example:")
await browser_management_example()
print("\n5. Running API Example:")
# print("\n5. Running API Example:")
await api_example()
if __name__ == "__main__":

View File

@@ -15,6 +15,94 @@ docker run -p 11235:11235 unclecode/crawl4ai:basic
docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic
```
## Running with Docker Compose 🐳
### Use Docker Compose (From Local Dockerfile or Docker Hub)
Crawl4AI can be managed with Docker Compose: either build the image locally from the provided `Dockerfile`, or use the pre-built image from Docker Hub.
### **Option 1: Using Docker Compose to Build Locally**
If you want to build the image locally, use the provided `docker-compose.local.yml` file.
```bash
docker-compose -f docker-compose.local.yml up -d
```
This will:
1. Build the Docker image from the provided `Dockerfile`.
2. Start the container and expose it on `http://localhost:11235`.
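For reference, a compose file along these lines would produce that behavior. This is only an illustrative sketch, not the file shipped with the repository: the service name and environment entry are assumptions, while the port and token variable come from the documentation on this page.

```yaml
# Illustrative sketch of a local-build compose file (assumed service name "crawl4ai";
# the actual docker-compose.local.yml in the repository may differ).
services:
  crawl4ai:
    build:
      context: .              # build from the provided Dockerfile in the repo root
    ports:
      - "11235:11235"         # expose the API on http://localhost:11235
    environment:
      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}   # optional API token (see API Security below)
```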
---
### **Option 2: Using Docker Compose with Pre-Built Image from Hub**
If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file.
```bash
docker-compose -f docker-compose.hub.yml up -d
```
This will:
1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration).
2. Start the container and expose it on `http://localhost:11235`.
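The hub variant differs only in pulling a published image instead of building one. Again as a sketch, with the same caveat that the actual `docker-compose.hub.yml` may differ:

```yaml
# Illustrative sketch of a hub-based compose file.
services:
  crawl4ai:
    image: unclecode/crawl4ai:basic   # or unclecode/crawl4ai:all
    ports:
      - "11235:11235"
    environment:
      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
```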
---
### **Stopping the Running Services**
To stop the services started via Docker Compose, you can use:
```bash
docker-compose -f docker-compose.local.yml down
# OR
docker-compose -f docker-compose.hub.yml down
```
If the containers don't stop and the application keeps running, check which containers are still up:
```bash
docker ps
```
Find the `CONTAINER ID` of the running service and stop it forcefully:
```bash
docker stop <CONTAINER_ID>
```
---
### **Debugging with Docker Compose**
- **Check Logs**: To view the container logs:
```bash
docker-compose -f docker-compose.local.yml logs -f
```
- **Remove Orphaned Containers**: If the service is still running unexpectedly:
```bash
docker-compose -f docker-compose.local.yml down --remove-orphans
```
- **Manually Remove Network**: If the network is still in use:
```bash
docker network ls
docker network rm crawl4ai_default
```
---
### Why Use Docker Compose?
Docker Compose is the recommended way to deploy Crawl4AI because:
1. It simplifies multi-container setups.
2. It lets you define environment variables, resources, and ports in a single file.
3. It makes it easy to switch between local development and production-ready images.
For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
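As a hedged illustration of that idea (the memory figure is a placeholder rather than a value from the project, and the layout assumes a recent Docker Compose that honors `deploy.resources`):

```yaml
# Illustrative sketch: centralizing token and resource settings in one compose file.
services:
  crawl4ai:
    image: unclecode/crawl4ai:basic
    ports:
      - "11235:11235"
    environment:
      - CRAWL4AI_API_TOKEN=your_secret_token   # token checked by the API when set
    deploy:
      resources:
        limits:
          memory: 4G          # placeholder cap; tune for your crawling workload
```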
## API Security 🔒
### Understanding CRAWL4AI_API_TOKEN