Migrating from the classic setup.py to a pyproject.toml-based approach.
This commit is contained in:
@@ -1 +0,0 @@
|
|||||||
include requirements.txt
|
|
||||||
48
build_hooks.py
Normal file
48
build_hooks.py
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
|
||||||
|
PLUGIN = "CustomBuildHook"
|
||||||
|
|
||||||
|
class CustomBuildHook(BuildHookInterface):
    """Hatchling build hook that prepares the local Crawl4AI environment.

    When the hook is initialized it performs three steps in order:

    1. (Re)creates the ``.crawl4ai`` folder structure, wiping any old cache.
    2. Tries to install the Playwright browsers.
    3. Tries to initialize the Crawl4AI database.

    Steps 2 and 3 are best-effort: any failure is reported with a printed
    warning and does not interrupt the build.
    """

    def initialize(self, version, build_data):
        """Run the environment preparation steps.

        Args:
            version: Supplied by the hatchling hook interface; unused here.
            build_data: Supplied by the hatchling hook interface; unused here.

        Raises:
            OSError: If the folder structure cannot be removed or created
                (this step is intentionally not guarded).
        """
        self._prepare_folders()
        self._install_playwright_browsers()
        self._initialize_database()

    def _prepare_folders(self):
        """Create the ``.crawl4ai`` directory tree, dropping any stale cache."""
        # The base directory can be overridden through the environment;
        # otherwise the user's home directory is used.
        base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
        crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
        crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
        cache_folder = crawl4ai_folder / "cache"
        content_folders = [
            "html_content",
            "cleaned_html",
            "markdown_content",
            "extracted_content",
            "screenshots",
        ]

        # Clean up old cache if exists
        if cache_folder.exists():
            shutil.rmtree(cache_folder)

        # Create new folder structure. ``parents=True`` is required so that a
        # CRAWL4_AI_BASE_DIRECTORY pointing to a not-yet-existing path does
        # not make the hook fail with FileNotFoundError.
        crawl4ai_folder.mkdir(parents=True, exist_ok=True)
        cache_folder.mkdir(exist_ok=True)
        for folder in content_folders:
            (crawl4ai_folder / folder).mkdir(exist_ok=True)

    def _install_playwright_browsers(self):
        """Run ``python -m playwright install``; warn instead of failing."""
        try:
            subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        except Exception as e:
            # Deliberately broad: a failed browser download must not abort
            # the build, the user is told how to finish the step manually.
            print(f"Warning: Playwright installation failed: {e}")
            print("Please run 'python -m playwright install' manually after installation")

    def _initialize_database(self):
        """Initialize the async database; warn instead of failing."""
        try:
            # Imported lazily: the package (or its dependencies) may not be
            # importable in the build environment.
            from crawl4ai.async_database import async_db_manager
            import asyncio

            asyncio.run(async_db_manager.initialize())
        except Exception as e:
            print(f"Warning: Database initialization failed: {e}")
            print("Database will be initialized on first use")
|
||||||
@@ -32,7 +32,7 @@ print("Website: https://crawl4ai.com")
|
|||||||
async def simple_crawl():
|
async def simple_crawl():
|
||||||
print("\n--- Basic Usage ---")
|
print("\n--- Basic Usage ---")
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
result = await crawler.arun(url="https://www.nbcnews.com/business")
|
result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS)
|
||||||
print(result.markdown[:500]) # Print first 500 characters
|
print(result.markdown[:500]) # Print first 500 characters
|
||||||
|
|
||||||
async def simple_example_with_running_js_code():
|
async def simple_example_with_running_js_code():
|
||||||
@@ -76,16 +76,17 @@ async def use_proxy():
|
|||||||
async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
|
async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://www.nbcnews.com/business",
|
url="https://www.nbcnews.com/business",
|
||||||
bypass_cache=True
|
cache_mode= CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
print(result.markdown[:500]) # Print first 500 characters
|
if result.success:
|
||||||
|
print(result.markdown[:500]) # Print first 500 characters
|
||||||
|
|
||||||
async def capture_and_save_screenshot(url: str, output_path: str):
|
async def capture_and_save_screenshot(url: str, output_path: str):
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url=url,
|
url=url,
|
||||||
screenshot=True,
|
screenshot=True,
|
||||||
bypass_cache=True
|
cache_mode= CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.success and result.screenshot:
|
if result.success and result.screenshot:
|
||||||
@@ -141,41 +142,68 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
|
|||||||
async def extract_structured_data_using_css_extractor():
|
async def extract_structured_data_using_css_extractor():
|
||||||
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
||||||
schema = {
|
schema = {
|
||||||
"name": "Coinbase Crypto Prices",
|
"name": "KidoCode Courses",
|
||||||
"baseSelector": ".cds-tableRow-t45thuk",
|
"baseSelector": "section.charge-methodology .w-tab-content > div",
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"name": "crypto",
|
"name": "section_title",
|
||||||
"selector": "td:nth-child(1) h2",
|
"selector": "h3.heading-50",
|
||||||
"type": "text",
|
"type": "text",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "symbol",
|
"name": "section_description",
|
||||||
"selector": "td:nth-child(1) p",
|
"selector": ".charge-content",
|
||||||
"type": "text",
|
"type": "text",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "price",
|
"name": "course_name",
|
||||||
"selector": "td:nth-child(2)",
|
"selector": ".text-block-93",
|
||||||
"type": "text",
|
"type": "text",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "course_description",
|
||||||
|
"selector": ".course-content-text",
|
||||||
|
"type": "text",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "course_icon",
|
||||||
|
"selector": ".image-92",
|
||||||
|
"type": "attribute",
|
||||||
|
"attribute": "src"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(
|
||||||
|
headless=True,
|
||||||
|
verbose=True
|
||||||
|
) as crawler:
|
||||||
|
|
||||||
|
# Create the JavaScript that handles clicking multiple times
|
||||||
|
js_click_tabs = """
|
||||||
|
(async () => {
|
||||||
|
const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
|
||||||
|
|
||||||
|
for(let tab of tabs) {
|
||||||
|
// scroll to the tab
|
||||||
|
tab.scrollIntoView();
|
||||||
|
tab.click();
|
||||||
|
// Wait for content to load and animations to complete
|
||||||
|
await new Promise(r => setTimeout(r, 500));
|
||||||
}
|
}
|
||||||
],
|
})();
|
||||||
}
|
"""
|
||||||
|
|
||||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://www.coinbase.com/explore",
|
url="https://www.kidocode.com/degrees/technology",
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
|
||||||
cache_mode=CacheMode.BYPASS,
|
js_code=[js_click_tabs],
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.success, "Failed to crawl the page"
|
companies = json.loads(result.extracted_content)
|
||||||
|
print(f"Successfully extracted {len(companies)} companies")
|
||||||
news_teasers = json.loads(result.extracted_content)
|
print(json.dumps(companies[0], indent=2))
|
||||||
print(f"Successfully extracted {len(news_teasers)} news teasers")
|
|
||||||
print(json.dumps(news_teasers[0], indent=2))
|
|
||||||
|
|
||||||
# Advanced Session-Based Crawling with Dynamic Content 🔄
|
# Advanced Session-Based Crawling with Dynamic Content 🔄
|
||||||
async def crawl_dynamic_content_pages_method_1():
|
async def crawl_dynamic_content_pages_method_1():
|
||||||
@@ -363,21 +391,21 @@ async def crawl_custom_browser_type():
|
|||||||
# Use Firefox
|
# Use Firefox
|
||||||
start = time.time()
|
start = time.time()
|
||||||
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
|
async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
|
||||||
result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
|
result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
|
||||||
print(result.markdown[:500])
|
print(result.markdown[:500])
|
||||||
print("Time taken: ", time.time() - start)
|
print("Time taken: ", time.time() - start)
|
||||||
|
|
||||||
# Use WebKit
|
# Use WebKit
|
||||||
start = time.time()
|
start = time.time()
|
||||||
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
|
async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
|
||||||
result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
|
result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
|
||||||
print(result.markdown[:500])
|
print(result.markdown[:500])
|
||||||
print("Time taken: ", time.time() - start)
|
print("Time taken: ", time.time() - start)
|
||||||
|
|
||||||
# Use Chromium (default)
|
# Use Chromium (default)
|
||||||
start = time.time()
|
start = time.time()
|
||||||
async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
|
async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
|
||||||
result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
|
result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
|
||||||
print(result.markdown[:500])
|
print(result.markdown[:500])
|
||||||
print("Time taken: ", time.time() - start)
|
print("Time taken: ", time.time() - start)
|
||||||
|
|
||||||
@@ -534,29 +562,29 @@ async def fit_markdown_remove_overlay():
|
|||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
await simple_crawl()
|
# await simple_crawl()
|
||||||
await simple_example_with_running_js_code()
|
# await simple_example_with_running_js_code()
|
||||||
await simple_example_with_css_selector()
|
# await simple_example_with_css_selector()
|
||||||
await use_proxy()
|
# await use_proxy()
|
||||||
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
|
# await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
|
||||||
await extract_structured_data_using_css_extractor()
|
# await extract_structured_data_using_css_extractor()
|
||||||
|
|
||||||
# LLM extraction examples
|
# LLM extraction examples
|
||||||
# await extract_structured_data_using_llm()
|
# await extract_structured_data_using_llm()
|
||||||
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
|
# await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
|
||||||
# await extract_structured_data_using_llm("ollama/llama3.2")
|
# await extract_structured_data_using_llm("ollama/llama3.2")
|
||||||
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
# You always can pass custom headers to the extraction strategy
|
# You always can pass custom headers to the extraction strategy
|
||||||
custom_headers = {
|
# custom_headers = {
|
||||||
"Authorization": "Bearer your-custom-token",
|
# "Authorization": "Bearer your-custom-token",
|
||||||
"X-Custom-Header": "Some-Value"
|
# "X-Custom-Header": "Some-Value"
|
||||||
}
|
# }
|
||||||
await extract_structured_data_using_llm(extra_headers=custom_headers)
|
# await extract_structured_data_using_llm(extra_headers=custom_headers)
|
||||||
|
|
||||||
# await crawl_dynamic_content_pages_method_1()
|
# await crawl_dynamic_content_pages_method_1()
|
||||||
# await crawl_dynamic_content_pages_method_2()
|
# await crawl_dynamic_content_pages_method_2()
|
||||||
await crawl_dynamic_content_pages_method_3()
|
# await crawl_dynamic_content_pages_method_3()
|
||||||
|
|
||||||
await crawl_custom_browser_type()
|
await crawl_custom_browser_type()
|
||||||
|
|
||||||
|
|||||||
9
plugin.py
Normal file
9
plugin.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
from colorama import Fore, Style
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def post_install():
    """Print a colored banner reminding the user to install Playwright browsers."""
    # Each entry is printed on its own line, in order; the first entry starts
    # with a blank line and the last one resets the terminal style.
    banner_lines = (
        f"\n{Fore.YELLOW}{'='*40}",
        f"{Fore.RED}IMPORTANT: Run this command now:",
        f"{Fore.GREEN}python -m playwright install",
        f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n",
    )
    for banner_line in banner_lines:
        print(banner_line)
|
||||||
19
post_install.py
Normal file
19
post_install.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
from colorama import Fore, Style
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import distutils.log as log
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def main():
    """Install the Playwright browsers, falling back to a manual-install banner.

    Runs ``python -m playwright install`` with its output discarded. If the
    command cannot be started or exits with an error, a colored banner asking
    the user to run the command manually is printed instead of raising.
    """
    try:
        subprocess.check_call(
            [sys.executable, "-m", "playwright", "install"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        # Best-effort step: any ordinary failure ends in the banner below.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate so the user can still abort the install.
        print(f"\n{Fore.YELLOW}{'='*40}")
        print(f"{Fore.RED}IMPORTANT: Run this command now:")
        print(f"{Fore.GREEN}python -m playwright install")
        print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
75
pyproject.toml
Normal file
75
pyproject.toml
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"]
|
||||||
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "Crawl4AI"
|
||||||
|
dynamic = ["version"]
|
||||||
|
description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||||
|
readme = "README.md"
|
||||||
|
license = "Apache-2.0"
|
||||||
|
requires-python = ">=3.7"
|
||||||
|
authors = [
|
||||||
|
{ name = "Unclecode", email = "unclecode@kidocode.com" },
|
||||||
|
]
|
||||||
|
classifiers = [
|
||||||
|
"Development Status :: 3 - Alpha",
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"License :: OSI Approved :: Apache Software License",
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3.7",
|
||||||
|
"Programming Language :: Python :: 3.8",
|
||||||
|
"Programming Language :: Python :: 3.9",
|
||||||
|
"Programming Language :: Python :: 3.10",
|
||||||
|
]
|
||||||
|
dependencies = [
|
||||||
|
"aiosqlite~=0.20",
|
||||||
|
"html2text~=2024.2",
|
||||||
|
"lxml~=5.3",
|
||||||
|
"litellm>=1.53.1",
|
||||||
|
"numpy>=1.26.0,<3",
|
||||||
|
"pillow~=10.4",
|
||||||
|
"playwright>=1.49.0",
|
||||||
|
"python-dotenv~=1.0",
|
||||||
|
"requests~=2.26",
|
||||||
|
"beautifulsoup4~=4.12",
|
||||||
|
"tf-playwright-stealth>=1.1.0",
|
||||||
|
"xxhash~=3.4",
|
||||||
|
"rank-bm25~=0.2",
|
||||||
|
"aiofiles>=24.1.0",
|
||||||
|
"colorama~=0.4",
|
||||||
|
"snowballstemmer~=2.2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
torch = ["torch", "nltk", "scikit-learn"]
|
||||||
|
transformer = ["transformers", "tokenizers"]
|
||||||
|
cosine = ["torch", "transformers", "nltk"]
|
||||||
|
sync = ["selenium"]
|
||||||
|
all = [
|
||||||
|
"torch",
|
||||||
|
"nltk",
|
||||||
|
"scikit-learn",
|
||||||
|
"transformers",
|
||||||
|
"tokenizers",
|
||||||
|
"selenium",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.urls]
|
||||||
|
Homepage = "https://github.com/unclecode/crawl4ai"
|
||||||
|
Documentation = "https://crawl4ai.com/mkdocs/"
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
crawl4ai-download-models = "crawl4ai.model_loader:main"
|
||||||
|
crawl4ai-migrate = "crawl4ai.migrations:main"
|
||||||
|
crawl4ai-post-install = "crawl4ai.post_install:main"
|
||||||
|
|
||||||
|
[tool.hatch.version]
|
||||||
|
path = "crawl4ai/__version__.py"
|
||||||
|
|
||||||
|
[tool.hatch.build.hooks.custom]
|
||||||
|
dependencies = ["hatch-fancy-pypi-readme>=22.5.0"]
|
||||||
|
path = "build_hooks.py"
|
||||||
|
|
||||||
|
[project.entry-points.hatch]
|
||||||
|
crawl4ai = "crawl4ai.plugin:post_install"
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
aiosqlite~=0.20
|
|
||||||
html2text~=2024.2
|
|
||||||
lxml~=5.3
|
|
||||||
litellm>=1.53.1
|
|
||||||
numpy>=1.26.0,<3
|
|
||||||
pillow~=10.4
|
|
||||||
playwright>=1.49.0
|
|
||||||
python-dotenv~=1.0
|
|
||||||
requests~=2.26
|
|
||||||
beautifulsoup4~=4.12
|
|
||||||
tf-playwright-stealth>=1.1.0
|
|
||||||
xxhash~=3.4
|
|
||||||
rank-bm25~=0.2
|
|
||||||
aiofiles>=24.1.0
|
|
||||||
colorama~=0.4
|
|
||||||
snowballstemmer~=2.2
|
|
||||||
136
setup.py
136
setup.py
@@ -1,136 +0,0 @@
|
|||||||
from setuptools import setup, find_packages
|
|
||||||
from setuptools.command.install import install
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
import shutil
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import asyncio
|
|
||||||
|
|
||||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
|
||||||
# If the folder already exists, remove the cache folder
|
|
||||||
base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
|
|
||||||
crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
|
|
||||||
crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
|
|
||||||
cache_folder = crawl4ai_folder / "cache"
|
|
||||||
content_folders = [
|
|
||||||
"html_content",
|
|
||||||
"cleaned_html",
|
|
||||||
"markdown_content",
|
|
||||||
"extracted_content",
|
|
||||||
"screenshots",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Clean up old cache if exists
|
|
||||||
if cache_folder.exists():
|
|
||||||
shutil.rmtree(cache_folder)
|
|
||||||
|
|
||||||
# Create new folder structure
|
|
||||||
crawl4ai_folder.mkdir(exist_ok=True)
|
|
||||||
cache_folder.mkdir(exist_ok=True)
|
|
||||||
for folder in content_folders:
|
|
||||||
(crawl4ai_folder / folder).mkdir(exist_ok=True)
|
|
||||||
|
|
||||||
# Read requirements and version
|
|
||||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
|
||||||
with open(os.path.join(__location__, "requirements.txt")) as f:
|
|
||||||
requirements = f.read().splitlines()
|
|
||||||
|
|
||||||
with open("crawl4ai/__version__.py") as f:
|
|
||||||
for line in f:
|
|
||||||
if line.startswith("__version__"):
|
|
||||||
version = line.split("=")[1].strip().strip('"')
|
|
||||||
break
|
|
||||||
|
|
||||||
# Define requirements
|
|
||||||
default_requirements = requirements
|
|
||||||
torch_requirements = ["torch", "nltk", "scikit-learn"]
|
|
||||||
transformer_requirements = ["transformers", "tokenizers"]
|
|
||||||
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
|
|
||||||
sync_requirements = ["selenium"]
|
|
||||||
|
|
||||||
|
|
||||||
def install_playwright():
|
|
||||||
print("Installing Playwright browsers...")
|
|
||||||
try:
|
|
||||||
subprocess.check_call([sys.executable, "-m", "playwright", "install"])
|
|
||||||
print("Playwright installation completed successfully.")
|
|
||||||
except subprocess.CalledProcessError as e:
|
|
||||||
print(f"Error during Playwright installation: {e}")
|
|
||||||
print(
|
|
||||||
"Please run 'python -m playwright install' manually after the installation."
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Unexpected error during Playwright installation: {e}")
|
|
||||||
print(
|
|
||||||
"Please run 'python -m playwright install' manually after the installation."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def run_migration():
|
|
||||||
"""Initialize database during installation"""
|
|
||||||
try:
|
|
||||||
print("Starting database initialization...")
|
|
||||||
from crawl4ai.async_database import async_db_manager
|
|
||||||
|
|
||||||
asyncio.run(async_db_manager.initialize())
|
|
||||||
print("Database initialization completed successfully.")
|
|
||||||
except ImportError:
|
|
||||||
print("Warning: Database module not found. Will initialize on first use.")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Warning: Database initialization failed: {e}")
|
|
||||||
print("Database will be initialized on first use")
|
|
||||||
|
|
||||||
|
|
||||||
class PostInstallCommand(install):
|
|
||||||
def run(self):
|
|
||||||
install.run(self)
|
|
||||||
install_playwright()
|
|
||||||
# run_migration()
|
|
||||||
|
|
||||||
|
|
||||||
setup(
|
|
||||||
name="Crawl4AI",
|
|
||||||
version=version,
|
|
||||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
|
|
||||||
long_description=open("README.md", encoding="utf-8").read(),
|
|
||||||
long_description_content_type="text/markdown",
|
|
||||||
url="https://github.com/unclecode/crawl4ai",
|
|
||||||
author="Unclecode",
|
|
||||||
author_email="unclecode@kidocode.com",
|
|
||||||
license="MIT",
|
|
||||||
packages=find_packages(),
|
|
||||||
install_requires=default_requirements
|
|
||||||
+ ["playwright", "aiofiles"], # Added aiofiles
|
|
||||||
extras_require={
|
|
||||||
"torch": torch_requirements,
|
|
||||||
"transformer": transformer_requirements,
|
|
||||||
"cosine": cosine_similarity_requirements,
|
|
||||||
"sync": sync_requirements,
|
|
||||||
"all": default_requirements
|
|
||||||
+ torch_requirements
|
|
||||||
+ transformer_requirements
|
|
||||||
+ cosine_similarity_requirements
|
|
||||||
+ sync_requirements,
|
|
||||||
},
|
|
||||||
entry_points={
|
|
||||||
"console_scripts": [
|
|
||||||
"crawl4ai-download-models=crawl4ai.model_loader:main",
|
|
||||||
"crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command
|
|
||||||
],
|
|
||||||
},
|
|
||||||
classifiers=[
|
|
||||||
"Development Status :: 3 - Alpha",
|
|
||||||
"Intended Audience :: Developers",
|
|
||||||
"License :: OSI Approved :: Apache Software License",
|
|
||||||
"Programming Language :: Python :: 3",
|
|
||||||
"Programming Language :: Python :: 3.7",
|
|
||||||
"Programming Language :: Python :: 3.8",
|
|
||||||
"Programming Language :: Python :: 3.9",
|
|
||||||
"Programming Language :: Python :: 3.10",
|
|
||||||
],
|
|
||||||
python_requires=">=3.7",
|
|
||||||
cmdclass={
|
|
||||||
"install": PostInstallCommand,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
Reference in New Issue
Block a user