Merge branch 'v0.4.243'
This commit is contained in:
18
README.md
18
README.md
@@ -20,9 +20,9 @@
|
||||
|
||||
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
|
||||
|
||||
[✨ Check out latest update v0.4.24](#-recent-updates)
|
||||
[✨ Check out latest update v0.4.24x](#-recent-updates)
|
||||
|
||||
🎉 **Version 0.4.24 is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
|
||||
🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
|
||||
|
||||
## 🧐 Why Crawl4AI?
|
||||
|
||||
@@ -38,14 +38,18 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
||||
1. Install Crawl4AI:
|
||||
```bash
|
||||
# Install the package
|
||||
pip install crawl4ai
|
||||
pip install -U crawl4ai
|
||||
|
||||
# Run post-installation setup
|
||||
crawl4ai-setup
|
||||
|
||||
# Install Playwright with system dependencies (recommended)
|
||||
playwright install --with-deps
|
||||
# Verify your installation
|
||||
crawl4ai-doctor
|
||||
```
|
||||
|
||||
# Or install specific browsers:
|
||||
playwright install --with-deps chrome # Recommended for Colab/Linux
|
||||
If you encounter any browser-related issues, you can install them manually:
|
||||
```bash
|
||||
python -m playwright install --with-deps chromium
|
||||
```
|
||||
|
||||
2. Run a simple web crawl:
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
# crawl4ai/_version.py
|
||||
__version__ = "0.4.24"
|
||||
__version__ = "0.4.243"
|
||||
|
||||
@@ -2,7 +2,6 @@ import subprocess
|
||||
import sys
|
||||
import asyncio
|
||||
from .async_logger import AsyncLogger, LogLevel
|
||||
from .docs_manager import DocsManager
|
||||
|
||||
# Initialize logger
|
||||
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
|
||||
@@ -12,24 +11,20 @@ def post_install():
|
||||
logger.info("Running post-installation setup...", tag="INIT")
|
||||
install_playwright()
|
||||
run_migration()
|
||||
asyncio.run(setup_docs())
|
||||
logger.success("Post-installation setup completed!", tag="COMPLETE")
|
||||
|
||||
def install_playwright():
|
||||
logger.info("Installing Playwright browsers...", tag="INIT")
|
||||
try:
|
||||
subprocess.check_call([sys.executable, "-m", "playwright", "install"])
|
||||
# subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"])
|
||||
subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"])
|
||||
logger.success("Playwright installation completed successfully.", tag="COMPLETE")
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
|
||||
logger.warning(
|
||||
"Please run 'python -m playwright install' manually after the installation."
|
||||
)
|
||||
# logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
|
||||
logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
|
||||
logger.warning(
|
||||
"Please run 'python -m playwright install' manually after the installation."
|
||||
)
|
||||
# logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
|
||||
logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
|
||||
|
||||
def run_migration():
|
||||
"""Initialize database during installation"""
|
||||
@@ -45,7 +40,44 @@ def run_migration():
|
||||
logger.warning(f"Database initialization failed: {e}")
|
||||
logger.warning("Database will be initialized on first use")
|
||||
|
||||
async def setup_docs():
|
||||
"""Download documentation files"""
|
||||
docs_manager = DocsManager(logger)
|
||||
await docs_manager.update_docs()
|
||||
async def run_doctor():
|
||||
"""Test if Crawl4AI is working properly"""
|
||||
logger.info("Running Crawl4AI health check...", tag="INIT")
|
||||
try:
|
||||
from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
browser_type="chromium",
|
||||
ignore_https_errors=True,
|
||||
light_mode=True,
|
||||
viewport_width=1280,
|
||||
viewport_height=720
|
||||
)
|
||||
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
screenshot=True,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
logger.info("Testing crawling capabilities...", tag="TEST")
|
||||
result = await crawler.arun(
|
||||
url="https://crawl4ai.com",
|
||||
config=run_config
|
||||
)
|
||||
|
||||
if result and result.markdown:
|
||||
logger.success("✅ Crawling test passed!", tag="COMPLETE")
|
||||
return True
|
||||
else:
|
||||
raise Exception("Failed to get content")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Test failed: {e}", tag="ERROR")
|
||||
return False
|
||||
|
||||
def doctor():
|
||||
"""Entry point for the doctor command"""
|
||||
import asyncio
|
||||
return asyncio.run(run_doctor())
|
||||
|
||||
@@ -169,7 +169,8 @@ async def demo_content_filtering():
|
||||
)
|
||||
|
||||
run_config = CrawlerRunConfig(
|
||||
markdown_generator=markdown_gen
|
||||
markdown_generator=markdown_gen,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
@@ -436,7 +437,7 @@ async def main():
|
||||
await demo_ssl_features()
|
||||
await demo_content_filtering()
|
||||
await demo_json_extraction()
|
||||
await demo_input_formats()
|
||||
# await demo_input_formats()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -31,7 +31,14 @@ By the end of this guide, you’ll have installed Crawl4AI, performed a basic cr
|
||||
```bash
|
||||
pip install crawl4ai
|
||||
crawl4ai-setup
|
||||
playwright install --with-deps
|
||||
|
||||
# Verify your installation
|
||||
crawl4ai-doctor
|
||||
```
|
||||
|
||||
If you encounter any browser-related issues, you can install them manually:
|
||||
```bash
|
||||
python -m playwright install --with-deps chrome chromium
|
||||
```
|
||||
|
||||
- **`crawl4ai-setup`** installs and configures Playwright (Chromium by default).
|
||||
|
||||
78
pyproject.toml
Normal file
78
pyproject.toml
Normal file
@@ -0,0 +1,78 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=64.0.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "Crawl4AI"
|
||||
dynamic = ["version"]
|
||||
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.9"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
{name = "Unclecode", email = "unclecode@kidocode.com"}
|
||||
]
|
||||
dependencies = [
|
||||
"aiosqlite~=0.20",
|
||||
"lxml~=5.3",
|
||||
"litellm>=1.53.1",
|
||||
"numpy>=1.26.0,<3",
|
||||
"pillow~=10.4",
|
||||
"playwright>=1.49.0",
|
||||
"python-dotenv~=1.0",
|
||||
"requests~=2.26",
|
||||
"beautifulsoup4~=4.12",
|
||||
"tf-playwright-stealth>=1.1.0",
|
||||
"xxhash~=3.4",
|
||||
"rank-bm25~=0.2",
|
||||
"aiofiles>=24.1.0",
|
||||
"colorama~=0.4",
|
||||
"snowballstemmer~=2.2",
|
||||
"pydantic>=2.10",
|
||||
"pyOpenSSL>=24.3.0",
|
||||
"psutil>=6.1.1",
|
||||
"nltk>=3.9.1",
|
||||
"playwright",
|
||||
"aiofiles"
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
torch = ["torch", "nltk", "scikit-learn"]
|
||||
transformer = ["transformers", "tokenizers"]
|
||||
cosine = ["torch", "transformers", "nltk"]
|
||||
sync = ["selenium"]
|
||||
all = [
|
||||
"torch",
|
||||
"nltk",
|
||||
"scikit-learn",
|
||||
"transformers",
|
||||
"tokenizers",
|
||||
"selenium"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
crawl4ai-download-models = "crawl4ai.model_loader:main"
|
||||
crawl4ai-migrate = "crawl4ai.migrations:main"
|
||||
crawl4ai-setup = "crawl4ai.install:post_install"
|
||||
crawl4ai-doctor = "crawl4ai.install:doctor"
|
||||
crawl = "crawl4ai.cli:cli"
|
||||
|
||||
[tool.setuptools]
|
||||
packages = {find = {where = ["."], include = ["crawl4ai*"]}}
|
||||
|
||||
[tool.setuptools.package-data]
|
||||
crawl4ai = ["js_snippet/*.js"]
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = {attr = "crawl4ai.__version__.__version__"}
|
||||
@@ -1,3 +1,5 @@
|
||||
# Note: These requirements are also specified in pyproject.toml
|
||||
# This file is kept for development environment setup and compatibility
|
||||
aiosqlite~=0.20
|
||||
lxml~=5.3
|
||||
litellm>=1.53.1
|
||||
@@ -14,4 +16,6 @@ aiofiles>=24.1.0
|
||||
colorama~=0.4
|
||||
snowballstemmer~=2.2
|
||||
pydantic>=2.10
|
||||
pyOpenSSL>=24.3.0
|
||||
pyOpenSSL>=24.3.0
|
||||
psutil>=6.1.1
|
||||
nltk>=3.9.1
|
||||
60
setup.py
60
setup.py
@@ -3,6 +3,8 @@ import os
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
|
||||
# Note: Most configuration is now in pyproject.toml
|
||||
# This setup.py is kept for backwards compatibility
|
||||
|
||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||
# If the folder already exists, remove the cache folder
|
||||
@@ -28,28 +30,20 @@ cache_folder.mkdir(exist_ok=True)
|
||||
for folder in content_folders:
|
||||
(crawl4ai_folder / folder).mkdir(exist_ok=True)
|
||||
|
||||
# Read requirements and version
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
with open(os.path.join(__location__, "requirements.txt")) as f:
|
||||
requirements = f.read().splitlines()
|
||||
|
||||
with open("crawl4ai/__version__.py") as f:
|
||||
for line in f:
|
||||
if line.startswith("__version__"):
|
||||
version = line.split("=")[1].strip().strip('"')
|
||||
break
|
||||
|
||||
# Define requirements
|
||||
default_requirements = requirements
|
||||
torch_requirements = ["torch", "nltk", "scikit-learn"]
|
||||
transformer_requirements = ["transformers", "tokenizers"]
|
||||
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
|
||||
sync_requirements = ["selenium"]
|
||||
version = "0.0.0" # This will be overridden by pyproject.toml's dynamic version
|
||||
try:
|
||||
with open("crawl4ai/__version__.py") as f:
|
||||
for line in f:
|
||||
if line.startswith("__version__"):
|
||||
version = line.split("=")[1].strip().strip('"')
|
||||
break
|
||||
except Exception:
|
||||
pass # Let pyproject.toml handle version
|
||||
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
version=version,
|
||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
|
||||
description="🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
|
||||
long_description=open("README.md", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/unclecode/crawl4ai",
|
||||
@@ -58,38 +52,18 @@ setup(
|
||||
license="MIT",
|
||||
packages=find_packages(),
|
||||
package_data={
|
||||
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
|
||||
},
|
||||
install_requires=default_requirements
|
||||
+ ["playwright", "aiofiles"], # Added aiofiles
|
||||
extras_require={
|
||||
"torch": torch_requirements,
|
||||
"transformer": transformer_requirements,
|
||||
"cosine": cosine_similarity_requirements,
|
||||
"sync": sync_requirements,
|
||||
"all": default_requirements
|
||||
+ torch_requirements
|
||||
+ transformer_requirements
|
||||
+ cosine_similarity_requirements
|
||||
+ sync_requirements,
|
||||
},
|
||||
entry_points={
|
||||
"console_scripts": [
|
||||
"crawl4ai-download-models=crawl4ai.model_loader:main",
|
||||
"crawl4ai-migrate=crawl4ai.migrations:main",
|
||||
'crawl4ai-setup=crawl4ai.install:post_install',
|
||||
'crawl=crawl4ai.cli:cli',
|
||||
],
|
||||
'crawl4ai': ['js_snippet/*.js']
|
||||
},
|
||||
classifiers=[
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
],
|
||||
python_requires=">=3.7",
|
||||
python_requires=">=3.9",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user