Merge branch '2025-APR-1' into 2025-MAY-2
@@ -744,18 +744,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 )
                 redirected_url = page.url
             except Error as e:
-                raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
+                # Allow navigation to be aborted when downloading files
+                # This is expected behavior for downloads in some browser engines
+                if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
+                    self.logger.info(
+                        message=f"Navigation aborted, likely due to file download: {url}",
+                        tag="GOTO",
+                        params={"url": url},
+                    )
+                    response = None
+                else:
+                    raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")

             await self.execute_hook(
                 "after_goto", page, context=context, url=url, response=response, config=config
             )

+            # ──────────────────────────────────────────────────────────────
+            # Walk the redirect chain. Playwright returns only the last
+            # hop, so we trace the `request.redirected_from` links until the
+            # first response that differs from the final one and surface its
+            # status-code.
+            # ──────────────────────────────────────────────────────────────
             if response is None:
                 status_code = 200
                 response_headers = {}
             else:
-                status_code = response.status
-                response_headers = response.headers
+                first_resp = response
+                req = response.request
+                while req and req.redirected_from:
+                    prev_req = req.redirected_from
+                    prev_resp = await prev_req.response()
+                    if prev_resp:  # keep earliest
+                        first_resp = prev_resp
+                    req = prev_req
+
+                status_code = first_resp.status
+                response_headers = first_resp.headers
+            # if response is None:
+            #     status_code = 200
+            #     response_headers = {}
+            # else:
+            #     status_code = response.status
+            #     response_headers = response.headers
         else:
             status_code = 200
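The redirect-chain walk above relies on Playwright's `request.redirected_from` links. A minimal standalone sketch of the same idea outside the crawler; the `first_hop_status` helper and the example URL are illustrative, not part of the commit:

```python
import asyncio
from playwright.async_api import async_playwright

async def first_hop_status(url: str) -> int:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        response = await page.goto(url)
        first_resp = response
        req = response.request
        # Each request points back at the request it was redirected from,
        # so following the links backwards reaches the first hop.
        while req and req.redirected_from:
            prev_resp = await req.redirected_from.response()
            if prev_resp:
                first_resp = prev_resp
            req = req.redirected_from
        await browser.close()
        return first_resp.status  # e.g. 301/302 for the first redirect hop

# asyncio.run(first_hop_status("https://example.com/some-redirecting-url"))
```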
@@ -1435,12 +1466,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             num_segments = (page_height // viewport_height) + 1
             for i in range(num_segments):
                 y_offset = i * viewport_height
+                # Special handling for the last segment
+                if i == num_segments - 1:
+                    last_part_height = page_height % viewport_height
+
+                    # If page_height is an exact multiple of viewport_height,
+                    # we don't need an extra segment
+                    if last_part_height == 0:
+                        # Skip last segment if page height is exact multiple of viewport
+                        break
+
+                    # Adjust viewport to exactly match the remaining content height
+                    await page.set_viewport_size({"width": page_width, "height": last_part_height})
+
                 await page.evaluate(f"window.scrollTo(0, {y_offset})")
                 await asyncio.sleep(0.01)  # wait for render
-                seg_shot = await page.screenshot(full_page=False)
+                # Capture the current segment
+                # Note: Using compression options (format, quality) would go here
+                seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85)
+                # seg_shot = await page.screenshot(full_page=False)
                 img = Image.open(BytesIO(seg_shot)).convert("RGB")
                 segments.append(img)

+            # Reset viewport to original size after capturing segments
+            await page.set_viewport_size({"width": page_width, "height": viewport_height})
+
             total_height = sum(img.height for img in segments)
             stitched = Image.new("RGB", (segments[0].width, total_height))
             offset = 0
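The hunk stops at `offset = 0`; the stitching step that normally follows pastes each captured segment below the previous one. A hedged sketch of that loop with Pillow (the `stitch_segments` helper name is illustrative, not from the diff):

```python
from io import BytesIO
from PIL import Image

def stitch_segments(segment_bytes: list[bytes]) -> Image.Image:
    # Decode each captured viewport screenshot
    segments = [Image.open(BytesIO(b)).convert("RGB") for b in segment_bytes]
    total_height = sum(img.height for img in segments)
    stitched = Image.new("RGB", (segments[0].width, total_height))
    offset = 0
    for img in segments:
        stitched.paste(img, (0, offset))  # place each segment below the previous one
        offset += img.height
    return stitched
```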
@@ -585,11 +585,13 @@ class AsyncWebCrawler:
                 # Choose content based on input_format
                 content_format = config.extraction_strategy.input_format
                 if content_format == "fit_markdown" and not markdown_result.fit_markdown:
-                    self.logger.warning(
-                        message="Fit markdown requested but not available. Falling back to raw markdown.",
-                        tag="EXTRACT",
-                        params={"url": _url},
-                    )
+                    self.logger.url_status(
+                        url=_url,
+                        success=bool(html),
+                        timing=time.perf_counter() - t1,
+                        tag="EXTRACT",
+                    )
                     content_format = "markdown"

                 content = {
@@ -613,11 +615,12 @@ class AsyncWebCrawler:
                 )

                 # Log extraction completion
-                self.logger.info(
-                    message="Completed for {url:.50}... | Time: {timing}s",
-                    tag="EXTRACT",
-                    params={"url": _url, "timing": time.perf_counter() - t1},
-                )
+                self.logger.url_status(
+                    url=_url,
+                    success=bool(html),
+                    timing=time.perf_counter() - t1,
+                    tag="EXTRACT",
+                )

             # Apply HTML formatting if requested
             if config.prettiify:
@@ -615,9 +615,18 @@ class BrowserProfiler:
         self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
         self.logger.info(f"Headless mode: {headless}", tag="CDP")

+        # create browser config
+        browser_config = BrowserConfig(
+            browser_type=browser_type,
+            headless=headless,
+            user_data_dir=profile_path,
+            debugging_port=debugging_port,
+            verbose=True
+        )
+
         # Create managed browser instance
         managed_browser = ManagedBrowser(
-            browser_type=browser_type,
+            browser_config=browser_config,
             user_data_dir=profile_path,
             headless=headless,
             logger=self.logger,
@@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):

         # Check flag if we should remove external images
         if kwargs.get("exclude_external_images", False):
-            element.decompose()
-            return False
-            # src_url_base = src.split('/')[2]
-            # url_base = url.split('/')[2]
-            # if url_base not in src_url_base:
-            # element.decompose()
-            # return False
+            # Handle relative URLs (which are always from the same domain)
+            if not src.startswith('http') and not src.startswith('//'):
+                return True  # Keep relative URLs
+
+            # For absolute URLs, compare the base domains using the existing function
+            src_base_domain = get_base_domain(src)
+            url_base_domain = get_base_domain(url)
+
+            # If the domains don't match and both are valid, the image is external
+            if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
+                element.decompose()
+                return False

         # if kwargs.get('exclude_social_media_links', False):
         #     if image_src_base_domain in exclude_social_media_domains:
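For context, a rough stand-in for the domain comparison used above; `base_domain` here only approximates crawl4ai's `get_base_domain` helper, and the URLs are placeholders:

```python
from urllib.parse import urlparse

def base_domain(url: str) -> str:
    # Crude approximation: keep the last two labels of the hostname
    host = urlparse(url).netloc.lower().split(":")[0]
    parts = host.split(".")
    return ".".join(parts[-2:]) if len(parts) >= 2 else host

page_url = "https://blog.example.com/post"
img_src = "https://cdn.other-site.com/pic.png"
# Different base domains -> the image would be treated as external and removed
print(base_domain(img_src) != base_domain(page_url))  # True
```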
@@ -150,6 +150,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                 self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
                 break

+            # Calculate how many more URLs we can process in this batch
+            remaining = self.max_pages - self._pages_crawled
+            batch_size = min(BATCH_SIZE, remaining)
+            if batch_size <= 0:
+                # No more pages to crawl
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break
+
             batch: List[Tuple[float, int, str, Optional[str]]] = []
             # Retrieve up to BATCH_SIZE items from the priority queue.
             for _ in range(BATCH_SIZE):
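The added block clamps each batch to the remaining `max_pages` budget. A tiny illustration of that arithmetic (the function name and values are made up for the example):

```python
BATCH_SIZE = 10

def next_batch_size(max_pages: int, pages_crawled: int) -> int:
    remaining = max_pages - pages_crawled
    return max(0, min(BATCH_SIZE, remaining))

assert next_batch_size(max_pages=25, pages_crawled=20) == 5   # partial batch
assert next_batch_size(max_pages=25, pages_crawled=25) == 0   # stop crawling
```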
@@ -184,6 +192,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                 # Count only successful crawls toward max_pages limit
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                 yield result
@@ -157,6 +157,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         results: List[CrawlResult] = []

         while current_level and not self._cancel_event.is_set():
+            # Check if we've already reached max_pages before starting a new level
+            if self._pages_crawled >= self.max_pages:
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break
+
             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                 # Count only successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                 results_count += 1
                 yield result
@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
             # Count only successful crawls toward max_pages limit
             if result.success:
                 self._pages_crawled += 1
+                # Check if we've reached the limit during batch processing
+                if self._pages_crawled >= self.max_pages:
+                    self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                    break  # Exit the generator

             # Only discover links from successful crawls
             new_links: List[Tuple[str, Optional[str]]] = []
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                 # and only discover links from successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                     new_links: List[Tuple[str, Optional[str]]] = []
                     await self.link_discovery(result, url, depth, visited, new_links, depths)
@@ -42,6 +42,29 @@ from itertools import chain
 from collections import deque
 from typing import Generator, Iterable

+# Monkey patch to fix wildcard handling in urllib.robotparser
+from urllib.robotparser import RuleLine
+import re
+
+original_applies_to = RuleLine.applies_to
+
+def patched_applies_to(self, filename):
+    # Handle wildcards in paths
+    if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
+        pattern = self.path.replace('%2A', '*')
+        pattern = re.escape(pattern).replace('\\*', '.*')
+        pattern = '^' + pattern
+        if pattern.endswith('\\$'):
+            pattern = pattern[:-2] + '$'
+        try:
+            return bool(re.match(pattern, filename))
+        except re.error:
+            return original_applies_to(self, filename)
+    return original_applies_to(self, filename)
+
+RuleLine.applies_to = patched_applies_to
+# Monkey patch ends
+
 def chunk_documents(
     documents: Iterable[str],
     chunk_token_threshold: int,
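With the patch applied, wildcard rules behave as globs instead of literal prefixes. A quick check against the stdlib parser, assuming the patched module has been imported first; the rules and URLs are example data:

```python
from urllib.robotparser import RobotFileParser

parser = RobotFileParser()
parser.parse([
    "User-agent: *",
    "Disallow: /private/*.html",
])

# Stock urllib treats "/private/*.html" as a literal prefix and would allow this URL;
# with the patched RuleLine the wildcard matches and the URL is disallowed.
print(parser.can_fetch("*", "https://example.com/private/page.html"))  # False
print(parser.can_fetch("*", "https://example.com/public/page.html"))   # True
```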
@@ -303,7 +326,7 @@ class RobotsParser:
             robots_url = f"{scheme}://{domain}/robots.txt"

             async with aiohttp.ClientSession() as session:
-                async with session.get(robots_url, timeout=2) as response:
+                async with session.get(robots_url, timeout=2, ssl=False) as response:
                     if response.status == 200:
                         rules = await response.text()
                         self._cache_rules(domain, rules)
@@ -403,7 +403,7 @@ async def main():

     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
-        options={"ignore_links": True}
+        options={"ignore_links": True})

     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -4175,8 +4175,13 @@ async def main():
         verbose=True
     )

+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator
     )

     async with AsyncWebCrawler() as crawler:
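Put together, the corrected docs pattern wires the filter into a markdown generator and passes the generator (not the filter) to `CrawlerRunConfig`. A condensed, hedged sketch of that wiring; the provider, instruction, and URL are placeholders:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter

async def main():
    filter = LLMContentFilter(
        llm_config=LLMConfig(provider="openai/gpt-4o", api_token="env:OPENAI_API_KEY"),
        instruction="Keep only the main article content as clean markdown.",
        chunk_token_threshold=4096,
        verbose=True,
    )
    md_generator = DefaultMarkdownGenerator(
        content_filter=filter,
        options={"ignore_links": True},
    )
    config = CrawlerRunConfig(markdown_generator=md_generator)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        # The filtered output typically lands in the fit_markdown field
        print(result.markdown.fit_markdown)

asyncio.run(main())
```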
@@ -273,7 +273,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

 async def main():
@@ -298,7 +298,7 @@ async def main():
     # 3) Example LLM content filtering

     gemini_config = LLMConfig(
-        provider="gemini/gemini-1.5-pro"
+        provider="gemini/gemini-1.5-pro",
         api_token = "env:GEMINI_API_TOKEN"
     )
@@ -322,8 +322,9 @@ async def main():
     )

     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
         options={"ignore_links": True}
+    )

     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(
@@ -17,6 +17,9 @@
 - [Configuration Reference](#configuration-reference)
 - [Best Practices & Tips](#best-practices--tips)

+## Installation
+The Crawl4AI CLI will be installed automatically when you install the library.
+
 ## Basic Usage

 The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
@@ -233,7 +233,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -255,9 +255,12 @@ async def main():
         chunk_token_threshold=4096,  # Adjust based on your needs
         verbose=True
     )
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator,
     )

     async with AsyncWebCrawler() as crawler:
@@ -17,7 +17,7 @@ dependencies = [
     "lxml~=5.3",
     "litellm>=1.53.1",
     "numpy>=1.26.0,<3",
-    "pillow~=10.4",
+    "pillow>=10.4",
     "playwright>=1.49.0",
     "python-dotenv~=1.0",
     "requests~=2.26",
@@ -33,7 +33,6 @@ dependencies = [
     "psutil>=6.1.1",
     "nltk>=3.9.1",
     "playwright",
-    "aiofiles",
     "rich>=13.9.4",
     "cssselect>=1.2.0",
     "httpx>=0.27.2",
@@ -4,7 +4,7 @@ aiosqlite~=0.20
 lxml~=5.3
 litellm>=1.53.1
 numpy>=1.26.0,<3
-pillow~=10.4
+pillow>=10.4
 playwright>=1.49.0
 python-dotenv~=1.0
 requests~=2.26
@@ -23,3 +23,6 @@ rich>=13.9.4
 cssselect>=1.2.0
 chardet>=5.2.0
 brotli>=1.1.0
+fake-useragent>=2.2.0
+pdf2image>=1.17.0
+PyPDF2>=3.0.1