Merge branch 'v0.4.243'
This commit is contained in:
18
README.md
18
README.md
@@ -20,9 +20,9 @@
|
|||||||
|
|
||||||
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
|
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
|
||||||
|
|
||||||
[✨ Check out latest update v0.4.24](#-recent-updates)
|
[✨ Check out latest update v0.4.24x](#-recent-updates)
|
||||||
|
|
||||||
🎉 **Version 0.4.24 is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
|
🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
|
||||||
|
|
||||||
## 🧐 Why Crawl4AI?
|
## 🧐 Why Crawl4AI?
|
||||||
|
|
||||||
@@ -38,14 +38,18 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
|||||||
1. Install Crawl4AI:
|
1. Install Crawl4AI:
|
||||||
```bash
|
```bash
|
||||||
# Install the package
|
# Install the package
|
||||||
pip install crawl4ai
|
pip install -U crawl4ai
|
||||||
|
|
||||||
|
# Run post-installation setup
|
||||||
crawl4ai-setup
|
crawl4ai-setup
|
||||||
|
|
||||||
# Install Playwright with system dependencies (recommended)
|
# Verify your installation
|
||||||
playwright install --with-deps
|
crawl4ai-doctor
|
||||||
|
```
|
||||||
|
|
||||||
# Or install specific browsers:
|
If you encounter any browser-related issues, you can install the browsers manually:
|
||||||
playwright install --with-deps chrome # Recommended for Colab/Linux
|
```bash
|
||||||
|
python -m playwright install --with-deps chromium
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Run a simple web crawl:
|
2. Run a simple web crawl:
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.4.24"
|
__version__ = "0.4.243"
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ import subprocess
|
|||||||
import sys
|
import sys
|
||||||
import asyncio
|
import asyncio
|
||||||
from .async_logger import AsyncLogger, LogLevel
|
from .async_logger import AsyncLogger, LogLevel
|
||||||
from .docs_manager import DocsManager
|
|
||||||
|
|
||||||
# Initialize logger
|
# Initialize logger
|
||||||
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
|
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
|
||||||
@@ -12,24 +11,20 @@ def post_install():
|
|||||||
logger.info("Running post-installation setup...", tag="INIT")
|
logger.info("Running post-installation setup...", tag="INIT")
|
||||||
install_playwright()
|
install_playwright()
|
||||||
run_migration()
|
run_migration()
|
||||||
asyncio.run(setup_docs())
|
|
||||||
logger.success("Post-installation setup completed!", tag="COMPLETE")
|
logger.success("Post-installation setup completed!", tag="COMPLETE")
|
||||||
|
|
||||||
def install_playwright():
|
def install_playwright():
|
||||||
logger.info("Installing Playwright browsers...", tag="INIT")
|
logger.info("Installing Playwright browsers...", tag="INIT")
|
||||||
try:
|
try:
|
||||||
subprocess.check_call([sys.executable, "-m", "playwright", "install"])
|
# subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"])
|
||||||
|
subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"])
|
||||||
logger.success("Playwright installation completed successfully.", tag="COMPLETE")
|
logger.success("Playwright installation completed successfully.", tag="COMPLETE")
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
|
# logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
|
||||||
logger.warning(
|
logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
|
||||||
"Please run 'python -m playwright install' manually after the installation."
|
|
||||||
)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
|
# logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
|
||||||
logger.warning(
|
logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
|
||||||
"Please run 'python -m playwright install' manually after the installation."
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_migration():
|
def run_migration():
|
||||||
"""Initialize database during installation"""
|
"""Initialize database during installation"""
|
||||||
@@ -45,7 +40,44 @@ def run_migration():
|
|||||||
logger.warning(f"Database initialization failed: {e}")
|
logger.warning(f"Database initialization failed: {e}")
|
||||||
logger.warning("Database will be initialized on first use")
|
logger.warning("Database will be initialized on first use")
|
||||||
|
|
||||||
async def setup_docs():
|
async def run_doctor():
|
||||||
"""Download documentation files"""
|
"""Test if Crawl4AI is working properly"""
|
||||||
docs_manager = DocsManager(logger)
|
logger.info("Running Crawl4AI health check...", tag="INIT")
|
||||||
await docs_manager.update_docs()
|
try:
|
||||||
|
from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
browser_type="chromium",
|
||||||
|
ignore_https_errors=True,
|
||||||
|
light_mode=True,
|
||||||
|
viewport_width=1280,
|
||||||
|
viewport_height=720
|
||||||
|
)
|
||||||
|
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
screenshot=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
logger.info("Testing crawling capabilities...", tag="TEST")
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://crawl4ai.com",
|
||||||
|
config=run_config
|
||||||
|
)
|
||||||
|
|
||||||
|
if result and result.markdown:
|
||||||
|
logger.success("✅ Crawling test passed!", tag="COMPLETE")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
raise Exception("Failed to get content")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Test failed: {e}", tag="ERROR")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def doctor():
|
||||||
|
"""Entry point for the doctor command"""
|
||||||
|
import asyncio
|
||||||
|
return asyncio.run(run_doctor())
|
||||||
|
|||||||
@@ -169,7 +169,8 @@ async def demo_content_filtering():
|
|||||||
)
|
)
|
||||||
|
|
||||||
run_config = CrawlerRunConfig(
|
run_config = CrawlerRunConfig(
|
||||||
markdown_generator=markdown_gen
|
markdown_generator=markdown_gen,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
@@ -436,7 +437,7 @@ async def main():
|
|||||||
await demo_ssl_features()
|
await demo_ssl_features()
|
||||||
await demo_content_filtering()
|
await demo_content_filtering()
|
||||||
await demo_json_extraction()
|
await demo_json_extraction()
|
||||||
await demo_input_formats()
|
# await demo_input_formats()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|||||||
@@ -31,7 +31,14 @@ By the end of this guide, you’ll have installed Crawl4AI, performed a basic cr
|
|||||||
```bash
|
```bash
|
||||||
pip install crawl4ai
|
pip install crawl4ai
|
||||||
crawl4ai-setup
|
crawl4ai-setup
|
||||||
playwright install --with-deps
|
|
||||||
|
# Verify your installation
|
||||||
|
crawl4ai-doctor
|
||||||
|
```
|
||||||
|
|
||||||
|
If you encounter any browser-related issues, you can install the browsers manually:
|
||||||
|
```bash
|
||||||
|
python -m playwright install --with-deps chrome chromium
|
||||||
```
|
```
|
||||||
|
|
||||||
- **`crawl4ai-setup`** installs and configures Playwright (Chromium by default).
|
- **`crawl4ai-setup`** installs and configures Playwright (Chromium by default).
|
||||||
|
|||||||
78
pyproject.toml
Normal file
78
pyproject.toml
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=64.0.0", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "Crawl4AI"
|
||||||
|
dynamic = ["version"]
|
||||||
|
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.9"
|
||||||
|
license = {text = "MIT"}
|
||||||
|
authors = [
|
||||||
|
{name = "Unclecode", email = "unclecode@kidocode.com"}
|
||||||
|
]
|
||||||
|
dependencies = [
|
||||||
|
"aiosqlite~=0.20",
|
||||||
|
"lxml~=5.3",
|
||||||
|
"litellm>=1.53.1",
|
||||||
|
"numpy>=1.26.0,<3",
|
||||||
|
"pillow~=10.4",
|
||||||
|
"playwright>=1.49.0",
|
||||||
|
"python-dotenv~=1.0",
|
||||||
|
"requests~=2.26",
|
||||||
|
"beautifulsoup4~=4.12",
|
||||||
|
"tf-playwright-stealth>=1.1.0",
|
||||||
|
"xxhash~=3.4",
|
||||||
|
"rank-bm25~=0.2",
|
||||||
|
"aiofiles>=24.1.0",
|
||||||
|
"colorama~=0.4",
|
||||||
|
"snowballstemmer~=2.2",
|
||||||
|
"pydantic>=2.10",
|
||||||
|
"pyOpenSSL>=24.3.0",
|
||||||
|
"psutil>=6.1.1",
|
||||||
|
"nltk>=3.9.1",
|
||||||
|
"playwright",
|
||||||
|
"aiofiles"
|
||||||
|
]
|
||||||
|
classifiers = [
|
||||||
|
"Development Status :: 3 - Alpha",
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"License :: OSI Approved :: Apache Software License",
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3.9",
|
||||||
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
|
"Programming Language :: Python :: 3.12",
|
||||||
|
"Programming Language :: Python :: 3.13",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
torch = ["torch", "nltk", "scikit-learn"]
|
||||||
|
transformer = ["transformers", "tokenizers"]
|
||||||
|
cosine = ["torch", "transformers", "nltk"]
|
||||||
|
sync = ["selenium"]
|
||||||
|
all = [
|
||||||
|
"torch",
|
||||||
|
"nltk",
|
||||||
|
"scikit-learn",
|
||||||
|
"transformers",
|
||||||
|
"tokenizers",
|
||||||
|
"selenium"
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
crawl4ai-download-models = "crawl4ai.model_loader:main"
|
||||||
|
crawl4ai-migrate = "crawl4ai.migrations:main"
|
||||||
|
crawl4ai-setup = "crawl4ai.install:post_install"
|
||||||
|
crawl4ai-doctor = "crawl4ai.install:doctor"
|
||||||
|
crawl = "crawl4ai.cli:cli"
|
||||||
|
|
||||||
|
[tool.setuptools]
|
||||||
|
packages = {find = {where = ["."], include = ["crawl4ai*"]}}
|
||||||
|
|
||||||
|
[tool.setuptools.package-data]
|
||||||
|
crawl4ai = ["js_snippet/*.js"]
|
||||||
|
|
||||||
|
[tool.setuptools.dynamic]
|
||||||
|
version = {attr = "crawl4ai.__version__.__version__"}
|
||||||
@@ -1,3 +1,5 @@
|
|||||||
|
# Note: These requirements are also specified in pyproject.toml
|
||||||
|
# This file is kept for development environment setup and compatibility
|
||||||
aiosqlite~=0.20
|
aiosqlite~=0.20
|
||||||
lxml~=5.3
|
lxml~=5.3
|
||||||
litellm>=1.53.1
|
litellm>=1.53.1
|
||||||
@@ -14,4 +16,6 @@ aiofiles>=24.1.0
|
|||||||
colorama~=0.4
|
colorama~=0.4
|
||||||
snowballstemmer~=2.2
|
snowballstemmer~=2.2
|
||||||
pydantic>=2.10
|
pydantic>=2.10
|
||||||
pyOpenSSL>=24.3.0
|
pyOpenSSL>=24.3.0
|
||||||
|
psutil>=6.1.1
|
||||||
|
nltk>=3.9.1
|
||||||
60
setup.py
60
setup.py
@@ -3,6 +3,8 @@ import os
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
|
# Note: Most configuration is now in pyproject.toml
|
||||||
|
# This setup.py is kept for backwards compatibility
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||||
# If the folder already exists, remove the cache folder
|
# If the folder already exists, remove the cache folder
|
||||||
@@ -28,28 +30,20 @@ cache_folder.mkdir(exist_ok=True)
|
|||||||
for folder in content_folders:
|
for folder in content_folders:
|
||||||
(crawl4ai_folder / folder).mkdir(exist_ok=True)
|
(crawl4ai_folder / folder).mkdir(exist_ok=True)
|
||||||
|
|
||||||
# Read requirements and version
|
version = "0.0.0" # This will be overridden by pyproject.toml's dynamic version
|
||||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
try:
|
||||||
with open(os.path.join(__location__, "requirements.txt")) as f:
|
with open("crawl4ai/__version__.py") as f:
|
||||||
requirements = f.read().splitlines()
|
for line in f:
|
||||||
|
if line.startswith("__version__"):
|
||||||
with open("crawl4ai/__version__.py") as f:
|
version = line.split("=")[1].strip().strip('"')
|
||||||
for line in f:
|
break
|
||||||
if line.startswith("__version__"):
|
except Exception:
|
||||||
version = line.split("=")[1].strip().strip('"')
|
pass # Let pyproject.toml handle version
|
||||||
break
|
|
||||||
|
|
||||||
# Define requirements
|
|
||||||
default_requirements = requirements
|
|
||||||
torch_requirements = ["torch", "nltk", "scikit-learn"]
|
|
||||||
transformer_requirements = ["transformers", "tokenizers"]
|
|
||||||
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
|
|
||||||
sync_requirements = ["selenium"]
|
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="Crawl4AI",
|
name="Crawl4AI",
|
||||||
version=version,
|
version=version,
|
||||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
|
description="🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
|
||||||
long_description=open("README.md", encoding="utf-8").read(),
|
long_description=open("README.md", encoding="utf-8").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
url="https://github.com/unclecode/crawl4ai",
|
url="https://github.com/unclecode/crawl4ai",
|
||||||
@@ -58,38 +52,18 @@ setup(
|
|||||||
license="MIT",
|
license="MIT",
|
||||||
packages=find_packages(),
|
packages=find_packages(),
|
||||||
package_data={
|
package_data={
|
||||||
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
|
'crawl4ai': ['js_snippet/*.js']
|
||||||
},
|
|
||||||
install_requires=default_requirements
|
|
||||||
+ ["playwright", "aiofiles"], # Added aiofiles
|
|
||||||
extras_require={
|
|
||||||
"torch": torch_requirements,
|
|
||||||
"transformer": transformer_requirements,
|
|
||||||
"cosine": cosine_similarity_requirements,
|
|
||||||
"sync": sync_requirements,
|
|
||||||
"all": default_requirements
|
|
||||||
+ torch_requirements
|
|
||||||
+ transformer_requirements
|
|
||||||
+ cosine_similarity_requirements
|
|
||||||
+ sync_requirements,
|
|
||||||
},
|
|
||||||
entry_points={
|
|
||||||
"console_scripts": [
|
|
||||||
"crawl4ai-download-models=crawl4ai.model_loader:main",
|
|
||||||
"crawl4ai-migrate=crawl4ai.migrations:main",
|
|
||||||
'crawl4ai-setup=crawl4ai.install:post_install',
|
|
||||||
'crawl=crawl4ai.cli:cli',
|
|
||||||
],
|
|
||||||
},
|
},
|
||||||
classifiers=[
|
classifiers=[
|
||||||
"Development Status :: 3 - Alpha",
|
"Development Status :: 3 - Alpha",
|
||||||
"Intended Audience :: Developers",
|
"Intended Audience :: Developers",
|
||||||
"License :: OSI Approved :: Apache Software License",
|
"License :: OSI Approved :: Apache Software License",
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"Programming Language :: Python :: 3.7",
|
|
||||||
"Programming Language :: Python :: 3.8",
|
|
||||||
"Programming Language :: Python :: 3.9",
|
"Programming Language :: Python :: 3.9",
|
||||||
"Programming Language :: Python :: 3.10",
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
|
"Programming Language :: Python :: 3.12",
|
||||||
|
"Programming Language :: Python :: 3.13",
|
||||||
],
|
],
|
||||||
python_requires=">=3.7",
|
python_requires=">=3.9",
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user