diff --git a/README.md b/README.md index e8e6cddf..bbfa5858 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant 1. Install Crawl4AI: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` 2. Run a simple web crawl: @@ -125,34 +126,6 @@ if __name__ == "__main__": ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) -## Features ✨ - -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ Flexible timeout and delayed content retrieval options - ## Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. 
You can install it as a Python package or use Docker. @@ -168,11 +141,12 @@ For basic web crawling and scraping tasks: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. -👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: +👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` command should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: 1. Through the command line: diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0ccf13d8..cee7c25b 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode from .models import CrawlResult from .__version__ import __version__ -# __version__ = "0.3.73" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 8b69d491..4a938b75 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.745" +__version__ = "0.3.746" diff --git a/crawl4ai/install.py b/crawl4ai/install.py new file mode 100644 index 00000000..71fe30ea --- /dev/null +++ b/crawl4ai/install.py @@ -0,0 +1,44 @@ +import subprocess +import sys +import asyncio +from .async_logger import AsyncLogger, LogLevel + +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +def post_install(): +    """Run all post-installation tasks""" +    logger.info("Running post-installation setup...", tag="INIT") +    install_playwright() +    run_migration() +    logger.success("Post-installation setup completed!", tag="COMPLETE") + +def install_playwright(): +    logger.info("Installing Playwright 
browsers...", tag="INIT") + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + logger.success("Playwright installation completed successfully.", tag="COMPLETE") + except subprocess.CalledProcessError as e: + logger.error(f"Error during Playwright installation: {e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + except Exception as e: + logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + +def run_migration(): + """Initialize database during installation""" + try: + logger.info("Starting database initialization...", tag="INIT") + from crawl4ai.async_database import async_db_manager + + asyncio.run(async_db_manager.initialize()) + logger.success("Database initialization completed successfully.", tag="COMPLETE") + except ImportError: + logger.warning("Database module not found. 
Will initialize on first use.") + except Exception as e: + logger.warning(f"Database initialization failed: {e}") + logger.warning("Database will be initialized on first use") \ No newline at end of file diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py index 77616086..3386b0fb 100644 --- a/crawl4ai/migrations.py +++ b/crawl4ai/migrations.py @@ -9,9 +9,13 @@ import aiofiles import shutil import time from datetime import datetime +from .async_logger import AsyncLogger, LogLevel -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) class DatabaseMigration: def __init__(self, db_path: str): @@ -55,7 +59,8 @@ class DatabaseMigration: async def migrate_database(self): """Migrate existing database to file-based storage""" - logger.info("Starting database migration...") + # logger.info("Starting database migration...") + logger.info("Starting database migration...", tag="INIT") try: async with aiosqlite.connect(self.db_path) as db: @@ -91,19 +96,25 @@ class DatabaseMigration: migrated_count += 1 if migrated_count % 100 == 0: - logger.info(f"Migrated {migrated_count} records...") + logger.info(f"Migrated {migrated_count} records...", tag="INIT") + await db.commit() - logger.info(f"Migration completed. {migrated_count} records processed.") + logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE") except Exception as e: - logger.error(f"Migration failed: {e}") - raise + # logger.error(f"Migration failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def backup_database(db_path: str) -> str: """Create backup of existing database""" if not os.path.exists(db_path): - logger.info("No existing database found. 
Skipping backup.") + logger.info("No existing database found. Skipping backup.", tag="INIT") return None # Create backup with timestamp @@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str: # Create backup shutil.copy2(db_path, backup_path) - logger.info(f"Database backup created at: {backup_path}") + logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE") return backup_path except Exception as e: - logger.error(f"Backup failed: {e}") - raise + # logger.error(f"Backup failed: {e}") + logger.error( + message="Backup failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def run_migration(db_path: Optional[str] = None): """Run database migration""" @@ -128,7 +144,7 @@ db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") if not os.path.exists(db_path): - logger.info("No existing database found. Skipping migration.") + logger.info("No existing database found. Skipping migration.", tag="INIT") return # Create backup first diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 01f7677c..679a9bc2 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -562,18 +562,18 @@ async def fit_markdown_remove_overlay(): async def main(): - # await simple_crawl() - # await simple_example_with_running_js_code() - # await simple_example_with_css_selector() + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() # await use_proxy() - # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - # await extract_structured_data_using_css_extractor() + await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + await extract_structured_data_using_css_extractor() # LLM extraction examples # await 
extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") - # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy # custom_headers = { @@ -582,9 +582,9 @@ async def main(): # } # await extract_structured_data_using_llm(extra_headers=custom_headers) - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() - # await crawl_dynamic_content_pages_method_3() + await crawl_dynamic_content_pages_method_1() + await crawl_dynamic_content_pages_method_2() + await crawl_dynamic_content_pages_method_3() await crawl_custom_browser_type() diff --git a/requirements.txt b/requirements.txt index c0f6f183..741e12ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ aiosqlite~=0.20 -html2text~=2024.2 lxml~=5.3 litellm>=1.53.1 numpy>=1.26.0,<3 @@ -13,4 +12,5 @@ xxhash~=3.4 rank-bm25~=0.2 aiofiles>=24.1.0 colorama~=0.4 -snowballstemmer~=2.2 \ No newline at end of file +snowballstemmer~=2.2 +pydantic>=2.10 \ No newline at end of file diff --git a/setup.py b/setup.py index d44169bf..e6840cd0 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,8 @@ from setuptools import setup, find_packages -from setuptools.command.install import install import os from pathlib import Path import shutil -import subprocess -import sys -import asyncio + # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder @@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"] cosine_similarity_requirements = ["torch", "transformers", "nltk"] sync_requirements = ["selenium"] - -def install_playwright(): - 
print("Installing Playwright browsers...") - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - print("Playwright installation completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - except Exception as e: - print(f"Unexpected error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - - -def run_migration(): - """Initialize database during installation""" - try: - print("Starting database initialization...") - from crawl4ai.async_database import async_db_manager - - asyncio.run(async_db_manager.initialize()) - print("Database initialization completed successfully.") - except ImportError: - print("Warning: Database module not found. Will initialize on first use.") - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_playwright() - # run_migration() - - setup( name="Crawl4AI", version=version, @@ -116,7 +73,8 @@ setup( entry_points={ "console_scripts": [ "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command + "crawl4ai-migrate=crawl4ai.migrations:main", + 'crawl4ai-setup=crawl4ai.install:post_install', ], }, classifiers=[ @@ -130,7 +88,4 @@ setup( "Programming Language :: Python :: 3.10", ], python_requires=">=3.7", - cmdclass={ - "install": PostInstallCommand, - }, )