Compare commits

..

16 Commits

Author SHA1 Message Date
UncleCode
48647300b4 chore: Bump version to 0.7.2 2025-07-25 17:42:48 +08:00
UncleCode
9f9ea3bb3b chore: Clean up test artifacts and disable test workflow 2025-07-25 17:31:52 +08:00
UncleCode
d58b93c207 fix: Re-enable multi-platform Docker builds for ARM64 support 2025-07-25 16:38:11 +08:00
UncleCode
e2b4705010 fix: Use hardcoded Docker repository name to avoid masking issues 2025-07-25 15:52:26 +08:00
UncleCode
4a1abd5086 fix: Handle existing version on Test PyPI gracefully 2025-07-25 15:41:16 +08:00
UncleCode
04258cd4f2 fix: Speed up Docker test builds by using single platform and caching 2025-07-25 15:37:44 +08:00
UncleCode
84e462d9f8 Merge remote-tracking branch 'origin/develop' 2025-07-25 15:35:53 +08:00
UncleCode
9546773a07 fix: Move sentence-transformers to optional dependencies
- Moved sentence-transformers from core to optional dependencies in pyproject.toml
- Removed sentence-transformers from requirements.txt
- Added proper ImportError handling with helpful installation message
- This prevents ~2.5GB of NVIDIA CUDA libraries from being installed by default
- Users who need embedding features can install with: pip install 'crawl4ai[transformer]'
2025-07-24 21:24:40 +08:00
UncleCode
66a979ad11 fix: Install dependencies before version check in workflows 2025-07-24 21:01:36 +08:00
UncleCode
0c31e91b53 feat: Add CI/CD workflows for automated PyPI and Docker releases 2025-07-24 20:58:43 +08:00
ntohidi
1b6a31f88f fix: encode PDF results to base64 in /crawl endpoint. ref #1301 2025-07-23 13:52:18 +02:00
Nasrin
b8c261780f Merge pull request #1319 from volumetric/fix_for_bug_#1310
Removed the incorrect reference in browser_config variable
2025-07-23 12:45:12 +02:00
ntohidi
db6ad7a79d fix: update links in README and C4A-Script documentation for accuracy 2025-07-23 09:47:18 +02:00
Nasrin
004d514f33 Merge pull request #1265 from unclecode/feature/nasrin-cli-deep-crawl
Feature/CLI - deep-crawl: Add --deep-crawl CLI option with BFS/DFS/Best-First strategies and fix serialization error. ref #874
2025-07-23 09:40:33 +02:00
Vinit Agrawal
3a9e2c716e Remvoed the incorrect reference in browser_config variable 2025-07-18 10:01:00 +05:30
ntohidi
ee25c771d8 feat(cli): add deep crawling options with configurable strategies and max pages. ref #874 2025-07-02 14:07:23 +02:00
12 changed files with 359 additions and 33 deletions

141
.github/workflows/release.yml vendored Normal file
View File

@@ -0,0 +1,141 @@
name: Release Pipeline
on:
push:
tags:
- 'v*'
- '!test-v*' # Exclude test tags
jobs:
release:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Extract version from tag
id: get_version
run: |
TAG_VERSION=${GITHUB_REF#refs/tags/v}
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
echo "Releasing version: $TAG_VERSION"
- name: Install package dependencies
run: |
pip install -e .
- name: Check version consistency
run: |
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
echo "Tag version: $TAG_VERSION"
echo "Package version: $PACKAGE_VERSION"
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
echo "Please update crawl4ai/__version__.py to match the tag version"
exit 1
fi
echo "✅ Version check passed: $TAG_VERSION"
- name: Install build dependencies
run: |
python -m pip install --upgrade pip
pip install build twine
- name: Build package
run: python -m build
- name: Check package
run: twine check dist/*
- name: Upload to PyPI
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
run: |
echo "📦 Uploading to PyPI..."
twine upload dist/*
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
- name: Extract major and minor versions
id: versions
run: |
VERSION=${{ steps.get_version.outputs.VERSION }}
MAJOR=$(echo $VERSION | cut -d. -f1)
MINOR=$(echo $VERSION | cut -d. -f1-2)
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
- name: Build and push Docker images
uses: docker/build-push-action@v5
with:
context: .
push: true
tags: |
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
unclecode/crawl4ai:latest
platforms: linux/amd64,linux/arm64
- name: Create GitHub Release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tag_name: v${{ steps.get_version.outputs.VERSION }}
release_name: Release v${{ steps.get_version.outputs.VERSION }}
body: |
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
### 📦 Installation
**PyPI:**
```bash
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
```
**Docker:**
```bash
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
docker pull unclecode/crawl4ai:latest
```
### 📝 What's Changed
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
draft: false
prerelease: false
- name: Summary
run: |
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### 📦 PyPI Package" >> $GITHUB_STEP_SUMMARY
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY

View File

@@ -0,0 +1,116 @@
name: Test Release Pipeline
on:
push:
tags:
- 'test-v*'
jobs:
test-release:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: Extract version from tag
id: get_version
run: |
TAG_VERSION=${GITHUB_REF#refs/tags/test-v}
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
echo "Testing with version: $TAG_VERSION"
- name: Install package dependencies
run: |
pip install -e .
- name: Check version consistency
run: |
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
echo "Tag version: $TAG_VERSION"
echo "Package version: $PACKAGE_VERSION"
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
echo "Please update crawl4ai/__version__.py to match the tag version"
exit 1
fi
echo "✅ Version check passed: $TAG_VERSION"
- name: Install build dependencies
run: |
python -m pip install --upgrade pip
pip install build twine
- name: Build package
run: python -m build
- name: Check package
run: twine check dist/*
- name: Upload to Test PyPI
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }}
run: |
echo "📦 Uploading to Test PyPI..."
twine upload --repository testpypi dist/* || {
if [ $? -eq 1 ]; then
echo "⚠️ Upload failed - likely version already exists on Test PyPI"
echo "Continuing anyway for test purposes..."
else
exit 1
fi
}
echo "✅ Test PyPI step complete"
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
- name: Build and push Docker test images
uses: docker/build-push-action@v5
with:
context: .
push: true
tags: |
unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}
unclecode/crawl4ai:test-latest
platforms: linux/amd64,linux/arm64
cache-from: type=gha
cache-to: type=gha,mode=max
- name: Summary
run: |
echo "## 🎉 Test Release Complete!" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### 📦 Test PyPI Package" >> $GITHUB_STEP_SUMMARY
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
echo "- URL: https://test.pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
echo "- Install: \`pip install -i https://test.pypi.org/simple/ crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### 🐳 Docker Test Images" >> $GITHUB_STEP_SUMMARY
echo "- \`unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
echo "- \`unclecode/crawl4ai:test-latest\`" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### 🧹 Cleanup Commands" >> $GITHUB_STEP_SUMMARY
echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
echo "# Remove test tag" >> $GITHUB_STEP_SUMMARY
echo "git tag -d test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
echo "git push origin :test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "# Remove Docker test images" >> $GITHUB_STEP_SUMMARY
echo "docker rmi unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
echo "docker rmi unclecode/crawl4ai:test-latest" >> $GITHUB_STEP_SUMMARY
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY

View File

@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
[✨ Check out latest update v0.7.0](#-recent-updates)
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
<details>
<summary>🤓 <strong>My Personal Story</strong></summary>

View File

@@ -1,7 +1,7 @@
# crawl4ai/__version__.py
# This is the version that will be used for stable releases
__version__ = "0.7.1"
__version__ = "0.7.2"
# For nightly builds, this gets set during build process
__nightly_version__ = None

View File

@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
except Error:
visibility_info = await self.check_visibility(page)
if self.browser_config.config.verbose:
if self.browser_config.verbose:
self.logger.debug(
message="Body visibility info: {info}",
tag="DEBUG",

View File

@@ -502,9 +502,12 @@ class AsyncWebCrawler:
metadata = result.get("metadata", {})
else:
cleaned_html = sanitize_input_encode(result.cleaned_html)
media = result.media.model_dump()
tables = media.pop("tables", [])
links = result.links.model_dump()
# media = result.media.model_dump()
# tables = media.pop("tables", [])
# links = result.links.model_dump()
media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
tables = media.pop("tables", []) if isinstance(media, dict) else []
links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
metadata = result.metadata
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)

View File

@@ -27,7 +27,10 @@ from crawl4ai import (
PruningContentFilter,
BrowserProfiler,
DefaultMarkdownGenerator,
LLMConfig
LLMConfig,
BFSDeepCrawlStrategy,
DFSDeepCrawlStrategy,
BestFirstCrawlingStrategy,
)
from crawl4ai.config import USER_SETTINGS
from litellm import completion
@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
"""Crawl a website and extract content
Simple Usage:
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
# Handle deep crawling configuration
if deep_crawl:
if deep_crawl == "bfs":
crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
max_depth=3,
max_pages=max_pages
)
elif deep_crawl == "dfs":
crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
max_depth=3,
max_pages=max_pages
)
elif deep_crawl == "best-first":
crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
max_depth=3,
max_pages=max_pages
)
if verbose:
console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
config = get_global_config()
browser_cfg.verbose = config.get("VERBOSE", False)
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
verbose
)
# Handle deep crawl results (list) vs single result
if isinstance(result, list):
if len(result) == 0:
click.echo("No results found during deep crawling")
return
# Use the first result for question answering and output
main_result = result[0]
all_results = result
else:
# Single result from regular crawling
main_result = result
all_results = [result]
# Handle question
if question:
provider, token = setup_llm_config()
markdown = result.markdown.raw_markdown
markdown = main_result.markdown.raw_markdown
anyio.run(stream_llm_response, url, markdown, question, provider, token)
return
# Handle output
if not output_file:
if output == "all":
click.echo(json.dumps(result.model_dump(), indent=2))
if isinstance(result, list):
output_data = [r.model_dump() for r in all_results]
click.echo(json.dumps(output_data, indent=2))
else:
click.echo(json.dumps(main_result.model_dump(), indent=2))
elif output == "json":
print(result.extracted_content)
extracted_items = json.loads(result.extracted_content)
print(main_result.extracted_content)
extracted_items = json.loads(main_result.extracted_content)
click.echo(json.dumps(extracted_items, indent=2))
elif output in ["markdown", "md"]:
click.echo(result.markdown.raw_markdown)
click.echo(main_result.markdown.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
click.echo(result.markdown.fit_markdown)
click.echo(main_result.markdown.fit_markdown)
else:
if output == "all":
with open(output_file, "w") as f:
f.write(json.dumps(result.model_dump(), indent=2))
if isinstance(result, list):
output_data = [r.model_dump() for r in all_results]
f.write(json.dumps(output_data, indent=2))
else:
f.write(json.dumps(main_result.model_dump(), indent=2))
elif output == "json":
with open(output_file, "w") as f:
f.write(result.extracted_content)
f.write(main_result.extracted_content)
elif output in ["markdown", "md"]:
with open(output_file, "w") as f:
f.write(result.markdown.raw_markdown)
f.write(main_result.markdown.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
with open(output_file, "w") as f:
f.write(result.markdown.fit_markdown)
f.write(main_result.markdown.fit_markdown)
except Exception as e:
raise click.ClickException(str(e))
@@ -1354,9 +1401,11 @@ def profiles_cmd():
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
"""Crawl4AI CLI - Web content extraction tool
Simple Usage:
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
bypass_cache=bypass_cache,
question=question,
verbose=verbose,
profile=profile
profile=profile,
deep_crawl=deep_crawl,
max_pages=max_pages
)
def main():

View File

@@ -3342,7 +3342,13 @@ async def get_text_embeddings(
# Default: use sentence-transformers
else:
# Lazy load to avoid importing heavy libraries unless needed
from sentence_transformers import SentenceTransformer
try:
from sentence_transformers import SentenceTransformer
except ImportError:
raise ImportError(
"sentence-transformers is required for local embeddings. "
"Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
)
# Cache the model in function attribute to avoid reloading
if not hasattr(get_text_embeddings, '_models'):

View File

@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
from functools import partial
from uuid import uuid4
from datetime import datetime
from base64 import b64encode
import logging
from typing import Optional, AsyncGenerator
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
server_memory_mb = _get_memory_mb()
result_dict = result.model_dump()
result_dict['server_memory_mb'] = server_memory_mb
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
data = json.dumps(result_dict, default=datetime_handler) + "\n"
yield data.encode('utf-8')
@@ -443,10 +447,19 @@ async def handle_crawl_request(
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
# Process results to handle PDF bytes
processed_results = []
for result in results:
result_dict = result.model_dump()
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
processed_results.append(result_dict)
return {
"success": True,
"results": [result.model_dump() for result in results],
"results": processed_results,
"server_processing_time_s": end_time - start_time,
"server_memory_delta_mb": mem_delta_mb,
"server_peak_memory_mb": peak_mem_mb

View File

@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
Want to learn by doing? We've got you covered:
**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
### Running the Tutorial Locally

View File

@@ -44,7 +44,6 @@ dependencies = [
"brotli>=1.1.0",
"humanize>=4.10.0",
"lark>=1.2.2",
"sentence-transformers>=2.2.0",
"alphashape>=1.3.1",
"shapely>=2.0.0"
]
@@ -62,8 +61,8 @@ classifiers = [
[project.optional-dependencies]
pdf = ["PyPDF2"]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"]
transformer = ["transformers", "tokenizers", "sentence-transformers"]
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
sync = ["selenium"]
all = [
"PyPDF2",
@@ -72,8 +71,8 @@ all = [
"scikit-learn",
"transformers",
"tokenizers",
"selenium",
"PyPDF2"
"sentence-transformers",
"selenium"
]
[project.scripts]

View File

@@ -24,7 +24,6 @@ cssselect>=1.2.0
chardet>=5.2.0
brotli>=1.1.0
httpx[http2]>=0.27.2
sentence-transformers>=2.2.0
alphashape>=1.3.1
shapely>=2.0.0