* Fix: Use correct URL variable for raw HTML extraction (#1116) - Prevents full HTML content from being passed as URL to extraction strategies - Added unit tests to verify raw HTML and regular URL processing Fix: Wrong URL variable used for extraction of raw html * Fix #1181: Preserve whitespace in code blocks during HTML scraping The remove_empty_elements_fast() method was removing whitespace-only span elements inside <pre> and <code> tags, causing import statements like "import torch" to become "importtorch". Now skips elements inside code blocks where whitespace is significant. * Refactor Pydantic model configuration to use ConfigDict for arbitrary types * Fix EmbeddingStrategy: Uncomment response handling for the variations and clean up mock data. ref #1621 * Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638 * fix: ensure BrowserConfig.to_dict serializes proxy_config * feat: make LLM backoff configurable end-to-end - extend LLMConfig with backoff delay/attempt/factor fields and thread them through LLMExtractionStrategy, LLMContentFilter, table extraction, and Docker API handlers - expose the backoff parameter knobs on perform_completion_with_backoff/aperform_completion_with_backoff and document them in the md_v2 guides * reproduced AttributeError from #1642 * pass timeout parameter to docker client request * added missing deep crawling objects to init * generalized query in ContentRelevanceFilter to be a str or list * import modules from enhanceable deserialization * parameterized tests * Fix: capture current page URL to reflect JavaScript navigation and add test for delayed redirects. ref #1268 * refactor: replace PyPDF2 with pypdf across the codebase. ref #1412 * announcement: add application form for cloud API closed beta * Release v0.7.8: Stability & Bug Fix Release - Updated version to 0.7.8 - Introduced focused stability release addressing 11 community-reported bugs. 
- Key fixes include Docker API improvements, LLM extraction enhancements, URL handling corrections, and dependency updates. - Added detailed release notes for v0.7.8 in the blog and created a dedicated verification script to ensure all fixes are functioning as intended. - Updated documentation to reflect recent changes and improvements. * docs: add section for Crawl4AI Cloud API closed beta with application link * fix: add disk cleanup step to Docker workflow --------- Co-authored-by: rbushria <rbushri@gmail.com> Co-authored-by: AHMET YILMAZ <tawfik@kidocode.com> Co-authored-by: Soham Kukreti <kukretisoham@gmail.com> Co-authored-by: Chris Murphy <chris.murphy@klaviyo.com> Co-authored-by: Aravind Karnam <aravind.karanam@gmail.com>
101 lines
2.4 KiB
TOML
101 lines
2.4 KiB
TOML
# PEP 517/518 build configuration: setuptools >=64 is required for the
# [tool.setuptools.dynamic] version attribute used below.
[build-system]
requires = ["setuptools>=64.0.0", "wheel"]
build-backend = "setuptools.build_meta"
|
# Core project metadata (PEP 621). `version` is dynamic and resolved from
# crawl4ai.__version__ via [tool.setuptools.dynamic].
[project]
name = "Crawl4AI"
dynamic = ["version"]
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
readme = "README.md"
requires-python = ">=3.10"
license = "Apache-2.0"
authors = [
    {name = "Unclecode", email = "unclecode@kidocode.com"},
]
# PEP 508 specifiers, sorted alphabetically (case-insensitive).
dependencies = [
    "aiofiles>=24.1.0",
    "aiohttp>=3.11.11",
    "aiosqlite~=0.20",
    "alphashape>=1.3.1",
    "anyio>=4.0.0",
    "beautifulsoup4~=4.12",
    "brotli>=1.1.0",
    "chardet>=5.2.0",
    "click>=8.1.7",
    "cssselect>=1.2.0",
    "fake-useragent>=2.0.3",
    # NOTE(review): httpx appears both plain and with the http2 extra; the
    # extra form alone would suffice — confirm and consolidate.
    "httpx>=0.27.2",
    "httpx[http2]>=0.27.2",
    "humanize>=4.10.0",
    "lark>=1.2.2",
    "litellm>=1.53.1",
    "lxml~=5.3",
    "nltk>=3.9.1",
    "numpy>=1.26.0,<3",
    "patchright>=1.49.0",
    "pillow>=10.4",
    "playwright>=1.49.0",
    "psutil>=6.1.1",
    "pydantic>=2.10",
    "pyOpenSSL>=25.3.0",
    "python-dotenv~=1.0",
    "PyYAML>=6.0",
    "rank-bm25~=0.2",
    "requests~=2.26",
    "rich>=13.9.4",
    "shapely>=2.0.0",
    "snowballstemmer~=2.2",
    "tf-playwright-stealth>=1.1.0",
    "xxhash~=3.4",
]
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]
|
# Optional feature sets (install as e.g. `pip install crawl4ai[torch]`).
# `all` is the union of every extra.
[project.optional-dependencies]
pdf = ["pypdf"]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers", "sentence-transformers"]
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
sync = ["selenium"]
all = [
    "pypdf",
    "torch",
    "nltk",
    "scikit-learn",
    "transformers",
    "tokenizers",
    "sentence-transformers",
    "selenium",
]
|
# Console entry points installed with the package.
[project.scripts]
crawl4ai-doctor = "crawl4ai.install:doctor"
crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-setup = "crawl4ai.install:post_install"
crwl = "crawl4ai.cli:main"
|
# Package discovery: pick up crawl4ai and all its sub-packages from the
# repository root. Equivalent to the inline
# `packages = {find = {...}}` form, but the table header avoids nesting
# inline tables and matches the documented setuptools style.
[tool.setuptools.packages.find]
where = ["."]
include = ["crawl4ai*"]
|
# Non-Python files to ship inside the wheel (JS snippets injected at runtime).
[tool.setuptools.package-data]
crawl4ai = ["js_snippet/*.js"]
|
# Resolve the dynamic `project.version` from the package's __version__ module.
[tool.setuptools.dynamic]
version = {attr = "crawl4ai.__version__.__version__"}
|
# uv: resolve crawl4ai from this workspace rather than the package index.
[tool.uv.sources]
crawl4ai = { workspace = true }
|
# PEP 735 dependency groups: the dev group installs the workspace package
# itself (editable via the [tool.uv.sources] workspace mapping).
[dependency-groups]
dev = [
    "crawl4ai",
]