Merge branch 'main' into main
@@ -58,13 +58,15 @@ Pull and run images directly from Docker Hub without building locally.

 #### 1. Pull the Image

-Our latest release candidate is `0.6.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
+Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
+
+> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.

 ```bash
-# Pull the release candidate (recommended for latest features)
-docker pull unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
+# Pull the release candidate (for testing new features)
+docker pull unclecode/crawl4ai:0.7.0-r1

-# Or pull the latest stable version
+# Or pull the current stable version (0.6.0)
 docker pull unclecode/crawl4ai:latest
 ```

@@ -99,7 +101,7 @@ EOL
   -p 11235:11235 \
   --name crawl4ai \
   --shm-size=1g \
-  unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
+  unclecode/crawl4ai:0.7.0-r1
 ```

 * **With LLM support:**

@@ -110,7 +112,7 @@ EOL
   --name crawl4ai \
   --env-file .llm.env \
   --shm-size=1g \
-  unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
+  unclecode/crawl4ai:0.7.0-r1
 ```

 > The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
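A quick way to confirm the container started above is reachable (a minimal sketch, assuming the default `-p 11235:11235` mapping and a local `httpx` install):

```python
# Hedged sketch: hit the playground UI mentioned above and report the status code.
import httpx

resp = httpx.get("http://localhost:11235/playground", timeout=10.0)
print(resp.status_code)  # expect 200 once the server has finished starting
```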
@@ -124,7 +126,7 @@ docker stop crawl4ai && docker rm crawl4ai
|
||||
#### Docker Hub Versioning Explained
|
||||
|
||||
* **Image Name:** `unclecode/crawl4ai`
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.6.0-r1`)
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.0-r1`)
|
||||
* `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
|
||||
* `SUFFIX`: Optional tag for release candidates (``) and revisions (`r1`)
|
||||
* **`latest` Tag:** Points to the most recent stable version
|
||||
@@ -160,7 +162,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
|
||||
```bash
|
||||
# Pulls and runs the release candidate from Docker Hub
|
||||
# Automatically selects the correct architecture
|
||||
IMAGE=unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number docker compose up -d
|
||||
IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d
|
||||
```
|
||||
|
||||
* **Build and Run Locally:**
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
|
||||
from functools import partial
|
||||
from uuid import uuid4
|
||||
from datetime import datetime, timezone
|
||||
from base64 import b64encode
|
||||
|
||||
import logging
|
||||
from typing import Optional, AsyncGenerator
|
||||
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
         server_memory_mb = _get_memory_mb()
         result_dict = result.model_dump()
         result_dict['server_memory_mb'] = server_memory_mb
+        # If PDF exists, encode it to base64
+        if result_dict.get('pdf') is not None:
+            result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
         logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
         data = json.dumps(result_dict, default=datetime_handler) + "\n"
         yield data.encode('utf-8')
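With this change, streamed results carry the PDF as a base64 string rather than raw bytes, so clients must decode it. A minimal consumer sketch (the `/crawl/stream` path and the `urls` payload field are assumptions; adjust them to the deployed server's request schema):

```python
import base64
import json

import httpx  # already listed in the server requirements

# Hypothetical payload; field names depend on the server's CrawlRequest schema.
payload = {"urls": ["https://example.com"]}

with httpx.stream("POST", "http://localhost:11235/crawl/stream", json=payload, timeout=None) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        result = json.loads(line)  # one NDJSON object per crawled URL
        if result.get("pdf") is not None:
            pdf_bytes = base64.b64decode(result["pdf"])  # undo the server-side b64encode
            print(f"{result.get('url')}: PDF {len(pdf_bytes)} bytes")
```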
@@ -443,10 +447,19 @@ async def handle_crawl_request(
         mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
         peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
         logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")

+        # Process results to handle PDF bytes
+        processed_results = []
+        for result in results:
+            result_dict = result.model_dump()
+            # If PDF exists, encode it to base64
+            if result_dict.get('pdf') is not None:
+                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
+            processed_results.append(result_dict)

         return {
             "success": True,
-            "results": [result.model_dump() for result in results],
+            "results": processed_results,
             "server_processing_time_s": end_time - start_time,
             "server_memory_delta_mb": mem_delta_mb,
             "server_peak_memory_mb": peak_mem_mb
@@ -459,7 +472,7 @@ async def handle_crawl_request(
         # await crawler.close()
         # except Exception as close_e:
         # logger.error(f"Error closing crawler during exception handling: {close_e}")
-        logger.error(f"Error closing crawler during exception handling: {close_e}")
+        logger.error(f"Error closing crawler during exception handling: {str(e)}")

         # Measure memory even on error if possible
         end_mem_mb_error = _get_memory_mb()
@@ -518,7 +531,7 @@ async def handle_stream_crawl_request(
         # await crawler.close()
         # except Exception as close_e:
         # logger.error(f"Error closing crawler during stream setup exception: {close_e}")
-        logger.error(f"Error closing crawler during stream setup exception: {close_e}")
+        logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
         logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
         # Raising HTTPException here will prevent streaming response
         raise HTTPException(
@@ -1263,7 +1263,7 @@ class LLMConfig:
         provider: str = DEFAULT_PROVIDER,
         api_token: Optional[str] = None,
         base_url: Optional[str] = None,
-        temprature: Optional[float] = None,
+        temperature: Optional[float] = None,
         max_tokens: Optional[int] = None,
         top_p: Optional[float] = None,
         frequency_penalty: Optional[float] = None,
@@ -1291,7 +1291,7 @@ class LLMConfig:
         self.provider = DEFAULT_PROVIDER
         self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
         self.base_url = base_url
-        self.temprature = temprature
+        self.temperature = temperature
         self.max_tokens = max_tokens
         self.top_p = top_p
         self.frequency_penalty = frequency_penalty
@@ -1305,7 +1305,7 @@ class LLMConfig:
             provider=kwargs.get("provider", DEFAULT_PROVIDER),
             api_token=kwargs.get("api_token"),
             base_url=kwargs.get("base_url"),
-            temprature=kwargs.get("temprature"),
+            temperature=kwargs.get("temperature"),
             max_tokens=kwargs.get("max_tokens"),
             top_p=kwargs.get("top_p"),
             frequency_penalty=kwargs.get("frequency_penalty"),
@@ -1319,7 +1319,7 @@ class LLMConfig:
             "provider": self.provider,
             "api_token": self.api_token,
             "base_url": self.base_url,
-            "temprature": self.temprature,
+            "temperature": self.temperature,
             "max_tokens": self.max_tokens,
             "top_p": self.top_p,
             "frequency_penalty": self.frequency_penalty,
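For reference, the corrected keyword in use (a minimal sketch; the provider string and environment variable are illustrative):

```python
import os

from crawl4ai import LLMConfig

llm_config = LLMConfig(
    provider="openai/gpt-4o-mini",           # illustrative provider
    api_token=os.getenv("OPENAI_API_KEY"),   # illustrative: token read from the environment
    temperature=0.2,                         # renamed from the old misspelled 'temprature'
    max_tokens=800,
)
print(llm_config.temperature)
```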
@@ -4075,7 +4075,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
             api_token: The API token for the provider.
             base_url: The base URL for the API request.
             api_base: The base URL for the API request.
-            extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
+            extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
         """
         super().__init__( input_format=input_format, **kwargs)
         self.llm_config = llm_config
@@ -7901,7 +7901,7 @@ from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import PruningContentFilter
-from crawl4ai.extraction_strategy import (
+from crawl4ai import (
     JsonCssExtractionStrategy,
     LLMExtractionStrategy,
 )
@@ -8301,7 +8301,7 @@ async def crawl_dynamic_content_pages_method_2():


 async def cosine_similarity_extraction():
-    from crawl4ai.extraction_strategy import CosineStrategy
+    from crawl4ai import CosineStrategy
     crawl_config = CrawlerRunConfig(
         cache_mode=CacheMode.BYPASS,
         extraction_strategy=CosineStrategy(
@@ -332,7 +332,7 @@ The `clone()` method:
 ### Key fields to note

 1. **`provider`**:
-   - Which LLM provoder to use.
+   - Which LLM provider to use.
   - Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*

 2. **`api_token`**:
@@ -354,7 +354,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
 ```python
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy

 async def main():
     # 1) Browser config: headless, bigger viewport, no proxy
@@ -403,7 +403,7 @@ async def main():

     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
-        options={"ignore_links": True}
+        options={"ignore_links": True})

     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(
@@ -1042,7 +1042,7 @@ You can combine content selection with a more advanced extraction strategy. For
 import asyncio
 import json
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy

 async def main():
     # Minimal schema for repeated items
@@ -1094,7 +1094,7 @@ import asyncio
 import json
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
-from crawl4ai.extraction_strategy import LLMExtractionStrategy
+from crawl4ai import LLMExtractionStrategy

 class ArticleData(BaseModel):
     headline: str
@@ -1139,7 +1139,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a
 import asyncio
 import json
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy

 async def extract_main_articles(url: str):
     schema = {
@@ -1488,7 +1488,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, etc.), the structu
 import asyncio
 import json
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy

 async def main():
     schema = {
@@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_web():
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/apple",
@@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`.

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_local_file():
     local_file_path = "/path/to/apple.html" # Replace with your file path
     file_url = f"file://{local_file_path}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=file_url, config=config)
@@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`.

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_raw_html():
     raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
     raw_html_url = f"raw:{raw_html}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=raw_html_url, config=config)
@@ -3845,7 +3845,7 @@ import os
 import sys
 import asyncio
 from pathlib import Path
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def main():
@@ -3856,7 +3856,7 @@ async def main():
     async with AsyncWebCrawler() as crawler:
         # Step 1: Crawl the Web URL
         print("\n=== Step 1: Crawling the Wikipedia URL ===")
-        web_config = CrawlerRunConfig(bypass_cache=True)
+        web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         result = await crawler.arun(url=wikipedia_url, config=web_config)

         if not result.success:
@@ -3871,7 +3871,7 @@ async def main():
         # Step 2: Crawl from the Local HTML File
         print("=== Step 2: Crawling from the Local HTML File ===")
         file_url = f"file://{html_file_path.resolve()}"
-        file_config = CrawlerRunConfig(bypass_cache=True)
+        file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         local_result = await crawler.arun(url=file_url, config=file_config)

         if not local_result.success:
@@ -3887,7 +3887,7 @@ async def main():
         with open(html_file_path, 'r', encoding='utf-8') as f:
             raw_html_content = f.read()
         raw_html_url = f"raw:{raw_html_content}"
-        raw_config = CrawlerRunConfig(bypass_cache=True)
+        raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
         raw_result = await crawler.arun(url=raw_html_url, config=raw_config)

         if not raw_result.success:
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -4175,8 +4175,13 @@ async def main():
         verbose=True
     )

+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator
     )

     async with AsyncWebCrawler() as crawler:
@@ -4722,7 +4727,7 @@ if __name__ == "__main__":
 Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:

 ```python
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy

 schema = {
     "name": "Commits",
@@ -4902,7 +4907,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
 > **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:

 ```python
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy
 from crawl4ai import LLMConfig

 # Generate a schema (one-time cost)
@@ -4932,7 +4937,7 @@ Here's a basic extraction example:
 import asyncio
 import json
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy

 async def main():
     schema = {
@@ -4987,7 +4992,7 @@ import json
 import asyncio
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
-from crawl4ai.extraction_strategy import LLMExtractionStrategy
+from crawl4ai import LLMExtractionStrategy

 class OpenAIModelFee(BaseModel):
     model_name: str = Field(..., description="Name of the OpenAI model.")
@@ -5103,7 +5108,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. Bel
 ```python
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy

 async def extract_structured_data_using_css_extractor():
     print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
@@ -5428,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
 ```python
 import os, asyncio
 from base64 import b64decode
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

 async def main():
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        screenshot=True,
+        pdf=True
+    )
+
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
-            cache_mode=CacheMode.BYPASS,
-            pdf=True,
-            screenshot=True
+            config=run_config
         )

         if result.success:
-            # Save screenshot
+            print(f"Screenshot data present: {result.screenshot is not None}")
+            print(f"PDF data present: {result.pdf is not None}")
+
             if result.screenshot:
+                print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
                 with open("wikipedia_screenshot.png", "wb") as f:
                     f.write(b64decode(result.screenshot))

-            # Save PDF
+            else:
+                print("[WARN] Screenshot data is None.")
+
             if result.pdf:
+                print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
                 with open("wikipedia_page.pdf", "wb") as f:
                     f.write(result.pdf)

-            print("[OK] PDF & screenshot captured.")
+            else:
+                print("[WARN] PDF data is None.")

         else:
             print("[ERROR]", result.error_message)
@@ -6705,7 +6719,7 @@ dispatcher = MemoryAdaptiveDispatcher(
 3. **`max_session_permit`** (`int`, default: `10`)
    The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.

-4. **`memory_wait_timeout`** (`float`, default: `300.0`)
+4. **`memory_wait_timeout`** (`float`, default: `600.0`)
    Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.

 5. **`rate_limiter`** (`RateLimiter`, default: `None`)
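For reference, the raised default in context (a minimal sketch; the import path and the threshold value are assumptions):

```python
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher  # import path assumed

dispatcher = MemoryAdaptiveDispatcher(
    memory_threshold_percent=90.0,  # illustrative threshold
    max_session_permit=10,          # documented default
    memory_wait_timeout=600.0,      # new default: raise MemoryError after 10 minutes above the threshold
)
```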
@@ -7300,7 +7314,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv

 ```python
 from crawl4ai.async_configs import CrawlerRunConfig
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy
 from crawl4ai.cache_context import CacheMode

 async def crawl_dynamic_content():
@@ -7850,7 +7864,7 @@ The Cosine Strategy:
 ## Basic Usage

 ```python
-from crawl4ai.extraction_strategy import CosineStrategy
+from crawl4ai import CosineStrategy

 strategy = CosineStrategy(
     semantic_filter="product reviews",    # Target content type
@@ -8161,7 +8175,7 @@ import json
 from pydantic import BaseModel, Field
 from typing import List
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
-from crawl4ai.extraction_strategy import LLMExtractionStrategy
+from crawl4ai import LLMExtractionStrategy

 class Product(BaseModel):
     name: str
@@ -8278,7 +8292,7 @@ import asyncio
 from typing import List
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
-from crawl4ai.extraction_strategy import LLMExtractionStrategy
+from crawl4ai import LLMExtractionStrategy

 class Entity(BaseModel):
     name: str
@@ -8423,7 +8437,7 @@ Let’s begin with a **simple** schema-based extraction using the `JsonCssExtrac
 import json
 import asyncio
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy

 async def extract_crypto_prices():
     # 1. Define a simple extraction schema
@@ -8493,7 +8507,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`*
 import json
 import asyncio
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
+from crawl4ai import JsonXPathExtractionStrategy

 async def extract_crypto_prices_xpath():
     # 1. Minimal dummy HTML with some repeating rows
@@ -8694,7 +8708,7 @@ Key Takeaways:
 import json
 import asyncio
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy

 ecommerce_schema = {
     # ... the advanced schema from above ...
@@ -8804,7 +8818,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c
 The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:

 ```python
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
 from crawl4ai import LLMConfig

 # Sample HTML with product information

@@ -14,3 +14,4 @@ anyio==4.9.0
 PyJWT==2.10.1
 mcp>=1.6.0
 websockets>=15.0.1
+httpx[http2]>=0.27.2
@@ -12,8 +12,7 @@ class CrawlRequest(BaseModel):
 class MarkdownRequest(BaseModel):
     """Request body for the /md endpoint."""
     url: str = Field(..., description="Absolute http/https URL to fetch")
-    f: FilterType = Field(FilterType.FIT,
-                          description="Content‑filter strategy: FIT, RAW, BM25, or LLM")
+    f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
     q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
     c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
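For reference, a client request that exercises these fields (a minimal sketch; the `/md` path comes from the docstring above, while the host/port and the response shape are assumptions):

```python
import httpx

payload = {
    "url": "https://example.com",
    "f": "bm25",     # content-filter strategy: fit, raw, bm25, or llm
    "q": "pricing",  # query consumed by the BM25/LLM filters
    "c": "0",        # cache-bust / revision counter
}

resp = httpx.post("http://localhost:11235/md", json=payload, timeout=60.0)
resp.raise_for_status()
print(resp.json())  # assumed to contain the filtered markdown
```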
@@ -671,6 +671,16 @@
                 method: 'GET',
                 headers: { 'Accept': 'application/json' }
             });
+            responseData = await response.json();
+            const time = Math.round(performance.now() - startTime);
+            if (!response.ok) {
+                updateStatus('error', time);
+                throw new Error(responseData.error || 'Request failed');
+            }
+            updateStatus('success', time);
+            document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
+            document.querySelector('#response-content code').className = 'json hljs';
+            forceHighlightElement(document.querySelector('#response-content code'));
         } else if (endpoint === 'crawl_stream') {
             // Stream processing
             response = await fetch(api, {