Merge branch 'v0.4.243'

This commit is contained in:
UncleCode
2025-01-01 18:11:15 +08:00
8 changed files with 170 additions and 70 deletions

View File

@@ -20,9 +20,9 @@
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
[✨ Check out latest update v0.4.24](#-recent-updates)
[✨ Check out latest update v0.4.24x](#-recent-updates)
🎉 **Version 0.4.24 is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
## 🧐 Why Crawl4AI?
@@ -38,14 +38,18 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
1. Install Crawl4AI:
```bash
# Install the package
pip install crawl4ai
pip install -U crawl4ai
# Run post-installation setup
crawl4ai-setup
# Install Playwright with system dependencies (recommended)
playwright install --with-deps
# Verify your installation
crawl4ai-doctor
```
# Or install specific browsers:
playwright install --with-deps chrome # Recommended for Colab/Linux
If you encounter any browser-related issues, you can install them manually:
```bash
python -m playwright install --with-deps chromium
```
2. Run a simple web crawl:

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py
__version__ = "0.4.24"
__version__ = "0.4.243"

View File

@@ -2,7 +2,6 @@ import subprocess
import sys
import asyncio
from .async_logger import AsyncLogger, LogLevel
from .docs_manager import DocsManager
# Initialize logger
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
@@ -12,24 +11,20 @@ def post_install():
logger.info("Running post-installation setup...", tag="INIT")
install_playwright()
run_migration()
asyncio.run(setup_docs())
logger.success("Post-installation setup completed!", tag="COMPLETE")
def install_playwright():
logger.info("Installing Playwright browsers...", tag="INIT")
try:
subprocess.check_call([sys.executable, "-m", "playwright", "install"])
# subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"])
subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"])
logger.success("Playwright installation completed successfully.", tag="COMPLETE")
except subprocess.CalledProcessError as e:
logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
logger.warning(
"Please run 'python -m playwright install' manually after the installation."
)
# logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
except Exception as e:
logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
logger.warning(
"Please run 'python -m playwright install' manually after the installation."
)
# logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
def run_migration():
"""Initialize database during installation"""
@@ -45,7 +40,44 @@ def run_migration():
logger.warning(f"Database initialization failed: {e}")
logger.warning("Database will be initialized on first use")
async def setup_docs():
    """Download documentation files.

    Delegates to DocsManager.update_docs() using the module-level async
    logger; called from post_install() via asyncio.run().
    """
    # DocsManager is a project-local helper (imported at module top);
    # presumably it fetches/refreshes docs into the user's ~/.crawl4ai
    # folder — TODO confirm against docs_manager.py.
    docs_manager = DocsManager(logger)
    await docs_manager.update_docs()
async def run_doctor():
    """Test if Crawl4AI is working properly.

    Performs a live health check: launches a headless Chromium browser and
    crawls https://crawl4ai.com, logging progress through the module-level
    async logger.

    Returns:
        bool: True when the test crawl produced markdown content,
        False on any failure (the exception is logged, never re-raised).
    """
    logger.info("Running Crawl4AI health check...", tag="INIT")
    try:
        # Imported lazily so that an import-time failure in the crawler
        # stack is reported as a failed health check instead of crashing
        # the CLI entry point.
        from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

        browser_config = BrowserConfig(
            headless=True,
            browser_type="chromium",
            ignore_https_errors=True,  # tolerate SSL issues during the check
            light_mode=True,
            viewport_width=1280,
            viewport_height=720
        )
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,  # force a real fetch, not a cached result
            screenshot=True,
        )
        async with AsyncWebCrawler(config=browser_config) as crawler:
            logger.info("Testing crawling capabilities...", tag="TEST")
            result = await crawler.arun(
                url="https://crawl4ai.com",
                config=run_config
            )
            # Success requires non-empty markdown output from the crawl.
            if result and result.markdown:
                logger.success("✅ Crawling test passed!", tag="COMPLETE")
                return True
            else:
                # Converted into the False return path by the handler below.
                raise Exception("Failed to get content")
    except Exception as e:
        logger.error(f"❌ Test failed: {e}", tag="ERROR")
        return False
def doctor():
    """Entry point for the doctor command.

    Synchronously drives the async health check and returns its result
    (True on a successful test crawl, False otherwise).
    """
    import asyncio

    health_check = run_doctor()
    return asyncio.run(health_check)

View File

@@ -169,7 +169,8 @@ async def demo_content_filtering():
)
run_config = CrawlerRunConfig(
markdown_generator=markdown_gen
markdown_generator=markdown_gen,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
@@ -436,7 +437,7 @@ async def main():
await demo_ssl_features()
await demo_content_filtering()
await demo_json_extraction()
await demo_input_formats()
# await demo_input_formats()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -31,7 +31,14 @@ By the end of this guide, you'll have installed Crawl4AI, performed a basic cr
```bash
pip install crawl4ai
crawl4ai-setup
playwright install --with-deps
# Verify your installation
crawl4ai-doctor
```
If you encounter any browser-related issues, you can install them manually:
```bash
python -m playwright install --with-deps chromium
```
- **`crawl4ai-setup`** installs and configures Playwright (Chromium by default).

78
pyproject.toml Normal file
View File

@@ -0,0 +1,78 @@
# Packaging metadata for Crawl4AI (PEP 621).
# setup.py is retained only for backwards compatibility; this file is the
# source of truth for dependencies and entry points.
[build-system]
requires = ["setuptools>=64.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "Crawl4AI"
# Version is resolved dynamically from crawl4ai/__version__.py
# (see [tool.setuptools.dynamic] below).
dynamic = ["version"]
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
readme = "README.md"
requires-python = ">=3.9"
license = {text = "MIT"}
# NOTE(review): `license` says MIT but the classifier below says Apache —
# confirm which is correct and make them agree.
authors = [
    {name = "Unclecode", email = "unclecode@kidocode.com"}
]
# Fix: "playwright" and "aiofiles" were listed twice (once unpinned);
# keep only the version-constrained entries.
dependencies = [
    "aiosqlite~=0.20",
    "lxml~=5.3",
    "litellm>=1.53.1",
    "numpy>=1.26.0,<3",
    "pillow~=10.4",
    "playwright>=1.49.0",
    "python-dotenv~=1.0",
    "requests~=2.26",
    "beautifulsoup4~=4.12",
    "tf-playwright-stealth>=1.1.0",
    "xxhash~=3.4",
    "rank-bm25~=0.2",
    "aiofiles>=24.1.0",
    "colorama~=0.4",
    "snowballstemmer~=2.2",
    "pydantic>=2.10",
    "pyOpenSSL>=24.3.0",
    "psutil>=6.1.1",
    "nltk>=3.9.1"
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]

[project.optional-dependencies]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"]
sync = ["selenium"]
all = [
    "torch",
    "nltk",
    "scikit-learn",
    "transformers",
    "tokenizers",
    "selenium"
]

[project.scripts]
crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-setup = "crawl4ai.install:post_install"
crawl4ai-doctor = "crawl4ai.install:doctor"
crawl = "crawl4ai.cli:cli"

[tool.setuptools]
packages = {find = {where = ["."], include = ["crawl4ai*"]}}

[tool.setuptools.package-data]
crawl4ai = ["js_snippet/*.js"]

[tool.setuptools.dynamic]
version = {attr = "crawl4ai.__version__.__version__"}

View File

@@ -1,3 +1,5 @@
# Note: These requirements are also specified in pyproject.toml
# This file is kept for development environment setup and compatibility
aiosqlite~=0.20
lxml~=5.3
litellm>=1.53.1
@@ -14,4 +16,6 @@ aiofiles>=24.1.0
colorama~=0.4
snowballstemmer~=2.2
pydantic>=2.10
pyOpenSSL>=24.3.0
pyOpenSSL>=24.3.0
psutil>=6.1.1
nltk>=3.9.1

View File

@@ -3,6 +3,8 @@ import os
from pathlib import Path
import shutil
# Note: Most configuration is now in pyproject.toml
# This setup.py is kept for backwards compatibility
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
# If the folder already exists, remove the cache folder
@@ -28,28 +30,20 @@ cache_folder.mkdir(exist_ok=True)
for folder in content_folders:
(crawl4ai_folder / folder).mkdir(exist_ok=True)
# Read requirements and version
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
with open(os.path.join(__location__, "requirements.txt")) as f:
requirements = f.read().splitlines()
with open("crawl4ai/__version__.py") as f:
for line in f:
if line.startswith("__version__"):
version = line.split("=")[1].strip().strip('"')
break
# Define requirements
default_requirements = requirements
torch_requirements = ["torch", "nltk", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers"]
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
sync_requirements = ["selenium"]
version = "0.0.0" # This will be overridden by pyproject.toml's dynamic version
try:
with open("crawl4ai/__version__.py") as f:
for line in f:
if line.startswith("__version__"):
version = line.split("=")[1].strip().strip('"')
break
except Exception:
pass # Let pyproject.toml handle version
setup(
name="Crawl4AI",
version=version,
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
description="🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
url="https://github.com/unclecode/crawl4ai",
@@ -58,38 +52,18 @@ setup(
license="MIT",
packages=find_packages(),
package_data={
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
},
install_requires=default_requirements
+ ["playwright", "aiofiles"], # Added aiofiles
extras_require={
"torch": torch_requirements,
"transformer": transformer_requirements,
"cosine": cosine_similarity_requirements,
"sync": sync_requirements,
"all": default_requirements
+ torch_requirements
+ transformer_requirements
+ cosine_similarity_requirements
+ sync_requirements,
},
entry_points={
"console_scripts": [
"crawl4ai-download-models=crawl4ai.model_loader:main",
"crawl4ai-migrate=crawl4ai.migrations:main",
'crawl4ai-setup=crawl4ai.install:post_install',
'crawl=crawl4ai.cli:cli',
],
'crawl4ai': ['js_snippet/*.js']
},
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
],
python_requires=">=3.7",
python_requires=">=3.9",
)