Enhance installation and migration processes
- Added a post-installation setup script for initialization. - Updated README with installation notes for Playwright setup. - Enhanced migration logging for better error visibility. - Added 'pydantic' to requirements. - Bumped version to 0.3.746.
This commit is contained in:
32
README.md
32
README.md
@@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
|||||||
1. Install Crawl4AI:
|
1. Install Crawl4AI:
|
||||||
```bash
|
```bash
|
||||||
pip install crawl4ai
|
pip install crawl4ai
|
||||||
|
crawl4ai-setup # Set up the browser
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Run a simple web crawl:
|
2. Run a simple web crawl:
|
||||||
@@ -125,34 +126,6 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
|
✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
|
||||||
|
|
||||||
## Features ✨
|
|
||||||
|
|
||||||
- 🆓 Completely free and open-source
|
|
||||||
- 🚀 Blazing fast performance, outperforming many paid services
|
|
||||||
- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
|
|
||||||
- 🌐 Multi-browser support (Chromium, Firefox, WebKit)
|
|
||||||
- 🌍 Supports crawling multiple URLs simultaneously
|
|
||||||
- 🎨 Extracts and returns all media tags (Images, Audio, and Video)
|
|
||||||
- 🔗 Extracts all external and internal links
|
|
||||||
- 📚 Extracts metadata from the page
|
|
||||||
- 🔄 Custom hooks for authentication, headers, and page modifications
|
|
||||||
- 🕵️ User-agent customization
|
|
||||||
- 🖼️ Takes screenshots of pages with enhanced error handling
|
|
||||||
- 📜 Executes multiple custom JavaScripts before crawling
|
|
||||||
- 📊 Generates structured output without LLM using JsonCssExtractionStrategy
|
|
||||||
- 📚 Various chunking strategies: topic-based, regex, sentence, and more
|
|
||||||
- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more
|
|
||||||
- 🎯 CSS selector support for precise data extraction
|
|
||||||
- 📝 Passes instructions/keywords to refine extraction
|
|
||||||
- 🔒 Proxy support with authentication for enhanced access
|
|
||||||
- 🔄 Session management for complex multi-page crawling
|
|
||||||
- 🌐 Asynchronous architecture for improved performance
|
|
||||||
- 🖼️ Improved image processing with lazy-loading detection
|
|
||||||
- 🕰️ Enhanced handling of delayed content loading
|
|
||||||
- 🔑 Custom headers support for LLM interactions
|
|
||||||
- 🖼️ iframe content extraction for comprehensive analysis
|
|
||||||
- ⏱️ Flexible timeout and delayed content retrieval options
|
|
||||||
|
|
||||||
## Installation 🛠️
|
## Installation 🛠️
|
||||||
|
|
||||||
Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
|
Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
|
||||||
@@ -168,11 +141,12 @@ For basic web crawling and scraping tasks:
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install crawl4ai
|
pip install crawl4ai
|
||||||
|
crawl4ai-setup # Set up the browser
|
||||||
```
|
```
|
||||||
|
|
||||||
By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
|
By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
|
||||||
|
|
||||||
👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
|
👉 **Note**: After you install Crawl4AI, running the `crawl4ai-setup` command should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
|
||||||
|
|
||||||
1. Through the command line:
|
1. Through the command line:
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode
|
|||||||
|
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
# __version__ = "0.3.73"
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.3.745"
|
__version__ = "0.3.746"
|
||||||
|
|||||||
44
crawl4ai/install.py
Normal file
44
crawl4ai/install.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import asyncio
|
||||||
|
from .async_logger import AsyncLogger, LogLevel
|
||||||
|
|
||||||
|
# Initialize logger
|
||||||
|
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
|
||||||
|
|
||||||
|
def post_install():
    """Run every post-installation task in sequence.

    Exposed as the ``crawl4ai-setup`` console script. Logs a start message,
    runs the Playwright browser installation and then the database
    initialization, and finally logs a completion message.
    """
    logger.info("Running post-installation setup...", tag="INIT")
    # Steps run one after another: browser install, then database init.
    for setup_step in (install_playwright, run_migration):
        setup_step()
    logger.success("Post-installation setup completed!", tag="COMPLETE")
|
||||||
|
|
||||||
|
def install_playwright():
    """Install the Playwright browsers with the current interpreter.

    Runs ``<sys.executable> -m playwright install`` and logs the outcome.
    Any failure is logged (not re-raised) together with a hint telling the
    user how to run the installation manually.
    """
    manual_hint = (
        "Please run 'python -m playwright install' manually after the installation."
    )
    logger.info("Installing Playwright browsers...", tag="INIT")
    try:
        install_cmd = [sys.executable, "-m", "playwright", "install"]
        subprocess.check_call(install_cmd)
        logger.success("Playwright installation completed successfully.", tag="COMPLETE")
    except Exception as exc:
        # A non-zero exit status gets the plain message; everything else is
        # reported as unexpected. Both cases end with the same manual hint.
        if isinstance(exc, subprocess.CalledProcessError):
            failure = f"Error during Playwright installation: {exc}"
        else:
            failure = f"Unexpected error during Playwright installation: {exc}"
        logger.error(failure, tag="ERROR")
        logger.warning(manual_hint)
|
||||||
|
|
||||||
|
def run_migration():
    """Initialize the database as part of the installation.

    Best effort: an ``ImportError`` or any other exception raised while
    importing the database manager or running its ``initialize()`` coroutine
    is downgraded to a warning, and nothing is re-raised.
    """
    try:
        logger.info("Starting database initialization...", tag="INIT")
        # Imported inside the try so an ImportError is handled below.
        from crawl4ai.async_database import async_db_manager as db_manager

        init_coro = db_manager.initialize()
        asyncio.run(init_coro)
        logger.success("Database initialization completed successfully.", tag="COMPLETE")
    except ImportError:
        logger.warning("Database module not found. Will initialize on first use.")
    except Exception as exc:
        notes = (
            f"Database initialization failed: {exc}",
            "Database will be initialized on first use",
        )
        for note in notes:
            logger.warning(note)
|
||||||
@@ -9,9 +9,13 @@ import aiofiles
|
|||||||
import shutil
|
import shutil
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from .async_logger import AsyncLogger, LogLevel
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
# Initialize logger
|
||||||
logger = logging.getLogger(__name__)
|
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
|
||||||
|
|
||||||
|
# logging.basicConfig(level=logging.INFO)
|
||||||
|
# logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class DatabaseMigration:
|
class DatabaseMigration:
|
||||||
def __init__(self, db_path: str):
|
def __init__(self, db_path: str):
|
||||||
@@ -55,7 +59,8 @@ class DatabaseMigration:
|
|||||||
|
|
||||||
async def migrate_database(self):
|
async def migrate_database(self):
|
||||||
"""Migrate existing database to file-based storage"""
|
"""Migrate existing database to file-based storage"""
|
||||||
logger.info("Starting database migration...")
|
# logger.info("Starting database migration...")
|
||||||
|
logger.info("Starting database migration...", tag="INIT")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with aiosqlite.connect(self.db_path) as db:
|
async with aiosqlite.connect(self.db_path) as db:
|
||||||
@@ -91,19 +96,25 @@ class DatabaseMigration:
|
|||||||
|
|
||||||
migrated_count += 1
|
migrated_count += 1
|
||||||
if migrated_count % 100 == 0:
|
if migrated_count % 100 == 0:
|
||||||
logger.info(f"Migrated {migrated_count} records...")
|
logger.info(f"Migrated {migrated_count} records...", tag="INIT")
|
||||||
|
|
||||||
|
|
||||||
await db.commit()
|
await db.commit()
|
||||||
logger.info(f"Migration completed. {migrated_count} records processed.")
|
logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Migration failed: {e}")
|
# logger.error(f"Migration failed: {e}")
|
||||||
raise
|
logger.error(
|
||||||
|
message="Migration failed: {error}",
|
||||||
|
tag="ERROR",
|
||||||
|
params={"error": str(e)}
|
||||||
|
)
|
||||||
|
raise e
|
||||||
|
|
||||||
async def backup_database(db_path: str) -> str:
|
async def backup_database(db_path: str) -> str:
|
||||||
"""Create backup of existing database"""
|
"""Create backup of existing database"""
|
||||||
if not os.path.exists(db_path):
|
if not os.path.exists(db_path):
|
||||||
logger.info("No existing database found. Skipping backup.")
|
logger.info("No existing database found. Skipping backup.", tag="INIT")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Create backup with timestamp
|
# Create backup with timestamp
|
||||||
@@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str:
|
|||||||
|
|
||||||
# Create backup
|
# Create backup
|
||||||
shutil.copy2(db_path, backup_path)
|
shutil.copy2(db_path, backup_path)
|
||||||
logger.info(f"Database backup created at: {backup_path}")
|
logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
|
||||||
return backup_path
|
return backup_path
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Backup failed: {e}")
|
# logger.error(f"Backup failed: {e}")
|
||||||
raise
|
logger.error(
|
||||||
|
message="Migration failed: {error}",
|
||||||
|
tag="ERROR",
|
||||||
|
params={"error": str(e)}
|
||||||
|
)
|
||||||
|
raise e
|
||||||
|
|
||||||
async def run_migration(db_path: Optional[str] = None):
|
async def run_migration(db_path: Optional[str] = None):
|
||||||
"""Run database migration"""
|
"""Run database migration"""
|
||||||
@@ -128,7 +144,7 @@ async def run_migration(db_path: Optional[str] = None):
|
|||||||
db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
|
db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
|
||||||
|
|
||||||
if not os.path.exists(db_path):
|
if not os.path.exists(db_path):
|
||||||
logger.info("No existing database found. Skipping migration.")
|
logger.info("No existing database found. Skipping migration.", tag="INIT")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Create backup first
|
# Create backup first
|
||||||
|
|||||||
@@ -562,18 +562,18 @@ async def fit_markdown_remove_overlay():
|
|||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# await simple_crawl()
|
await simple_crawl()
|
||||||
# await simple_example_with_running_js_code()
|
await simple_example_with_running_js_code()
|
||||||
# await simple_example_with_css_selector()
|
await simple_example_with_css_selector()
|
||||||
# await use_proxy()
|
# await use_proxy()
|
||||||
# await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
|
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
|
||||||
# await extract_structured_data_using_css_extractor()
|
await extract_structured_data_using_css_extractor()
|
||||||
|
|
||||||
# LLM extraction examples
|
# LLM extraction examples
|
||||||
# await extract_structured_data_using_llm()
|
# await extract_structured_data_using_llm()
|
||||||
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
|
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
|
||||||
# await extract_structured_data_using_llm("ollama/llama3.2")
|
# await extract_structured_data_using_llm("ollama/llama3.2")
|
||||||
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
# You always can pass custom headers to the extraction strategy
|
# You always can pass custom headers to the extraction strategy
|
||||||
# custom_headers = {
|
# custom_headers = {
|
||||||
@@ -582,9 +582,9 @@ async def main():
|
|||||||
# }
|
# }
|
||||||
# await extract_structured_data_using_llm(extra_headers=custom_headers)
|
# await extract_structured_data_using_llm(extra_headers=custom_headers)
|
||||||
|
|
||||||
# await crawl_dynamic_content_pages_method_1()
|
await crawl_dynamic_content_pages_method_1()
|
||||||
# await crawl_dynamic_content_pages_method_2()
|
await crawl_dynamic_content_pages_method_2()
|
||||||
# await crawl_dynamic_content_pages_method_3()
|
await crawl_dynamic_content_pages_method_3()
|
||||||
|
|
||||||
await crawl_custom_browser_type()
|
await crawl_custom_browser_type()
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
aiosqlite~=0.20
|
aiosqlite~=0.20
|
||||||
html2text~=2024.2
|
|
||||||
lxml~=5.3
|
lxml~=5.3
|
||||||
litellm>=1.53.1
|
litellm>=1.53.1
|
||||||
numpy>=1.26.0,<3
|
numpy>=1.26.0,<3
|
||||||
@@ -13,4 +12,5 @@ xxhash~=3.4
|
|||||||
rank-bm25~=0.2
|
rank-bm25~=0.2
|
||||||
aiofiles>=24.1.0
|
aiofiles>=24.1.0
|
||||||
colorama~=0.4
|
colorama~=0.4
|
||||||
snowballstemmer~=2.2
|
snowballstemmer~=2.2
|
||||||
|
pydantic>=2.10
|
||||||
51
setup.py
51
setup.py
@@ -1,11 +1,8 @@
|
|||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
from setuptools.command.install import install
|
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import asyncio
|
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||||
# If the folder already exists, remove the cache folder
|
# If the folder already exists, remove the cache folder
|
||||||
@@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"]
|
|||||||
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
|
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
|
||||||
sync_requirements = ["selenium"]
|
sync_requirements = ["selenium"]
|
||||||
|
|
||||||
|
|
||||||
def install_playwright():
|
|
||||||
print("Installing Playwright browsers...")
|
|
||||||
try:
|
|
||||||
subprocess.check_call([sys.executable, "-m", "playwright", "install"])
|
|
||||||
print("Playwright installation completed successfully.")
|
|
||||||
except subprocess.CalledProcessError as e:
|
|
||||||
print(f"Error during Playwright installation: {e}")
|
|
||||||
print(
|
|
||||||
"Please run 'python -m playwright install' manually after the installation."
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Unexpected error during Playwright installation: {e}")
|
|
||||||
print(
|
|
||||||
"Please run 'python -m playwright install' manually after the installation."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def run_migration():
|
|
||||||
"""Initialize database during installation"""
|
|
||||||
try:
|
|
||||||
print("Starting database initialization...")
|
|
||||||
from crawl4ai.async_database import async_db_manager
|
|
||||||
|
|
||||||
asyncio.run(async_db_manager.initialize())
|
|
||||||
print("Database initialization completed successfully.")
|
|
||||||
except ImportError:
|
|
||||||
print("Warning: Database module not found. Will initialize on first use.")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Warning: Database initialization failed: {e}")
|
|
||||||
print("Database will be initialized on first use")
|
|
||||||
|
|
||||||
|
|
||||||
class PostInstallCommand(install):
|
|
||||||
def run(self):
|
|
||||||
install.run(self)
|
|
||||||
install_playwright()
|
|
||||||
# run_migration()
|
|
||||||
|
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="Crawl4AI",
|
name="Crawl4AI",
|
||||||
version=version,
|
version=version,
|
||||||
@@ -116,7 +73,8 @@ setup(
|
|||||||
entry_points={
|
entry_points={
|
||||||
"console_scripts": [
|
"console_scripts": [
|
||||||
"crawl4ai-download-models=crawl4ai.model_loader:main",
|
"crawl4ai-download-models=crawl4ai.model_loader:main",
|
||||||
"crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command
|
"crawl4ai-migrate=crawl4ai.migrations:main",
|
||||||
|
'crawl4ai-setup=crawl4ai.install:post_install',
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
classifiers=[
|
classifiers=[
|
||||||
@@ -130,7 +88,4 @@ setup(
|
|||||||
"Programming Language :: Python :: 3.10",
|
"Programming Language :: Python :: 3.10",
|
||||||
],
|
],
|
||||||
python_requires=">=3.7",
|
python_requires=">=3.7",
|
||||||
cmdclass={
|
|
||||||
"install": PostInstallCommand,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user