Compare commits
7 Commits
main
...
fix/config
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7a133e22cc | ||
|
|
b36c6daa5c | ||
|
|
94c8a833bf | ||
|
|
84bfea8bd1 | ||
|
|
7771ed3894 | ||
|
|
c2c4d42be4 | ||
|
|
edd0b576b1 |
@@ -167,6 +167,11 @@ RUN mkdir -p /home/appuser/.cache/ms-playwright \
|
|||||||
|
|
||||||
RUN crawl4ai-doctor
|
RUN crawl4ai-doctor
|
||||||
|
|
||||||
|
# Ensure all cache directories belong to appuser
|
||||||
|
# This fixes permission issues with .cache/url_seeder and other runtime cache dirs
|
||||||
|
RUN mkdir -p /home/appuser/.cache \
|
||||||
|
&& chown -R appuser:appuser /home/appuser/.cache
|
||||||
|
|
||||||
# Copy application code
|
# Copy application code
|
||||||
COPY deploy/docker/* ${APP_HOME}/
|
COPY deploy/docker/* ${APP_HOME}/
|
||||||
|
|
||||||
|
|||||||
@@ -728,18 +728,18 @@ class EmbeddingStrategy(CrawlStrategy):
|
|||||||
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
|
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
|
||||||
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
|
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
|
||||||
|
|
||||||
# response = perform_completion_with_backoff(
|
response = perform_completion_with_backoff(
|
||||||
# provider=provider,
|
provider=provider,
|
||||||
# prompt_with_variables=prompt,
|
prompt_with_variables=prompt,
|
||||||
# api_token=api_token,
|
api_token=api_token,
|
||||||
# json_response=True
|
json_response=True
|
||||||
# )
|
)
|
||||||
|
|
||||||
# variations = json.loads(response.choices[0].message.content)
|
variations = json.loads(response.choices[0].message.content)
|
||||||
|
|
||||||
|
|
||||||
# # Mock data with more variations for split
|
# # Mock data with more variations for split
|
||||||
variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
|
# variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
|
||||||
|
|
||||||
|
|
||||||
# variations = {'queries': [
|
# variations = {'queries': [
|
||||||
|
|||||||
@@ -1793,6 +1793,9 @@ class LLMConfig:
|
|||||||
presence_penalty: Optional[float] = None,
|
presence_penalty: Optional[float] = None,
|
||||||
stop: Optional[List[str]] = None,
|
stop: Optional[List[str]] = None,
|
||||||
n: Optional[int] = None,
|
n: Optional[int] = None,
|
||||||
|
backoff_base_delay: Optional[int] = None,
|
||||||
|
backoff_max_attempts: Optional[int] = None,
|
||||||
|
backoff_exponential_factor: Optional[int] = None,
|
||||||
):
|
):
|
||||||
"""Configuration class for LLM provider and API token."""
|
"""Configuration class for LLM provider and API token."""
|
||||||
self.provider = provider
|
self.provider = provider
|
||||||
@@ -1821,6 +1824,9 @@ class LLMConfig:
|
|||||||
self.presence_penalty = presence_penalty
|
self.presence_penalty = presence_penalty
|
||||||
self.stop = stop
|
self.stop = stop
|
||||||
self.n = n
|
self.n = n
|
||||||
|
self.backoff_base_delay = backoff_base_delay if backoff_base_delay is not None else 2
|
||||||
|
self.backoff_max_attempts = backoff_max_attempts if backoff_max_attempts is not None else 3
|
||||||
|
self.backoff_exponential_factor = backoff_exponential_factor if backoff_exponential_factor is not None else 2
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_kwargs(kwargs: dict) -> "LLMConfig":
|
def from_kwargs(kwargs: dict) -> "LLMConfig":
|
||||||
@@ -1834,7 +1840,10 @@ class LLMConfig:
|
|||||||
frequency_penalty=kwargs.get("frequency_penalty"),
|
frequency_penalty=kwargs.get("frequency_penalty"),
|
||||||
presence_penalty=kwargs.get("presence_penalty"),
|
presence_penalty=kwargs.get("presence_penalty"),
|
||||||
stop=kwargs.get("stop"),
|
stop=kwargs.get("stop"),
|
||||||
n=kwargs.get("n")
|
n=kwargs.get("n"),
|
||||||
|
backoff_base_delay=kwargs.get("backoff_base_delay"),
|
||||||
|
backoff_max_attempts=kwargs.get("backoff_max_attempts"),
|
||||||
|
backoff_exponential_factor=kwargs.get("backoff_exponential_factor")
|
||||||
)
|
)
|
||||||
|
|
||||||
def to_dict(self):
|
def to_dict(self):
|
||||||
@@ -1848,7 +1857,10 @@ class LLMConfig:
|
|||||||
"frequency_penalty": self.frequency_penalty,
|
"frequency_penalty": self.frequency_penalty,
|
||||||
"presence_penalty": self.presence_penalty,
|
"presence_penalty": self.presence_penalty,
|
||||||
"stop": self.stop,
|
"stop": self.stop,
|
||||||
"n": self.n
|
"n": self.n,
|
||||||
|
"backoff_base_delay": self.backoff_base_delay,
|
||||||
|
"backoff_max_attempts": self.backoff_max_attempts,
|
||||||
|
"backoff_exponential_factor": self.backoff_exponential_factor
|
||||||
}
|
}
|
||||||
|
|
||||||
def clone(self, **kwargs):
|
def clone(self, **kwargs):
|
||||||
|
|||||||
@@ -617,11 +617,11 @@ class AsyncWebCrawler:
|
|||||||
else config.chunking_strategy
|
else config.chunking_strategy
|
||||||
)
|
)
|
||||||
sections = chunking.chunk(content)
|
sections = chunking.chunk(content)
|
||||||
# extracted_content = config.extraction_strategy.run(url, sections)
|
# extracted_content = config.extraction_strategy.run(_url, sections)
|
||||||
|
|
||||||
# Use async version if available for better parallelism
|
# Use async version if available for better parallelism
|
||||||
if hasattr(config.extraction_strategy, 'arun'):
|
if hasattr(config.extraction_strategy, 'arun'):
|
||||||
extracted_content = await config.extraction_strategy.arun(url, sections)
|
extracted_content = await config.extraction_strategy.arun(_url, sections)
|
||||||
else:
|
else:
|
||||||
# Fallback to sync version run in thread pool to avoid blocking
|
# Fallback to sync version run in thread pool to avoid blocking
|
||||||
extracted_content = await asyncio.to_thread(
|
extracted_content = await asyncio.to_thread(
|
||||||
|
|||||||
@@ -980,6 +980,9 @@ class LLMContentFilter(RelevantContentFilter):
|
|||||||
prompt,
|
prompt,
|
||||||
api_token,
|
api_token,
|
||||||
base_url=base_url,
|
base_url=base_url,
|
||||||
|
base_delay=self.llm_config.backoff_base_delay,
|
||||||
|
max_attempts=self.llm_config.backoff_max_attempts,
|
||||||
|
exponential_factor=self.llm_config.backoff_exponential_factor,
|
||||||
extra_args=extra_args,
|
extra_args=extra_args,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
if el.tag in bypass_tags:
|
if el.tag in bypass_tags:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Skip elements inside <pre> or <code> tags where whitespace is significant
|
||||||
|
# This preserves whitespace-only spans (e.g., <span class="w"> </span>) in code blocks
|
||||||
|
is_in_code_block = False
|
||||||
|
ancestor = el.getparent()
|
||||||
|
while ancestor is not None:
|
||||||
|
if ancestor.tag in ("pre", "code"):
|
||||||
|
is_in_code_block = True
|
||||||
|
break
|
||||||
|
ancestor = ancestor.getparent()
|
||||||
|
|
||||||
|
if is_in_code_block:
|
||||||
|
continue
|
||||||
|
|
||||||
text_content = (el.text_content() or "").strip()
|
text_content = (el.text_content() or "").strip()
|
||||||
if (
|
if (
|
||||||
len(text_content.split()) < word_count_threshold
|
len(text_content.split()) < word_count_threshold
|
||||||
|
|||||||
@@ -649,6 +649,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
base_url=self.llm_config.base_url,
|
base_url=self.llm_config.base_url,
|
||||||
json_response=self.force_json_response,
|
json_response=self.force_json_response,
|
||||||
extra_args=self.extra_args,
|
extra_args=self.extra_args,
|
||||||
|
base_delay=self.llm_config.backoff_base_delay,
|
||||||
|
max_attempts=self.llm_config.backoff_max_attempts,
|
||||||
|
exponential_factor=self.llm_config.backoff_exponential_factor
|
||||||
) # , json_response=self.extract_type == "schema")
|
) # , json_response=self.extract_type == "schema")
|
||||||
# Track usage
|
# Track usage
|
||||||
usage = TokenUsage(
|
usage = TokenUsage(
|
||||||
@@ -846,6 +849,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
base_url=self.llm_config.base_url,
|
base_url=self.llm_config.base_url,
|
||||||
json_response=self.force_json_response,
|
json_response=self.force_json_response,
|
||||||
extra_args=self.extra_args,
|
extra_args=self.extra_args,
|
||||||
|
base_delay=self.llm_config.backoff_base_delay,
|
||||||
|
max_attempts=self.llm_config.backoff_max_attempts,
|
||||||
|
exponential_factor=self.llm_config.backoff_exponential_factor
|
||||||
)
|
)
|
||||||
# Track usage
|
# Track usage
|
||||||
usage = TokenUsage(
|
usage = TokenUsage(
|
||||||
|
|||||||
@@ -795,6 +795,9 @@ Return only a JSON array of extracted tables following the specified format."""
|
|||||||
api_token=self.llm_config.api_token,
|
api_token=self.llm_config.api_token,
|
||||||
base_url=self.llm_config.base_url,
|
base_url=self.llm_config.base_url,
|
||||||
json_response=True,
|
json_response=True,
|
||||||
|
base_delay=self.llm_config.backoff_base_delay,
|
||||||
|
max_attempts=self.llm_config.backoff_max_attempts,
|
||||||
|
exponential_factor=self.llm_config.backoff_exponential_factor,
|
||||||
extra_args=self.extra_args
|
extra_args=self.extra_args
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1116,6 +1119,9 @@ Return only a JSON array of extracted tables following the specified format."""
|
|||||||
api_token=self.llm_config.api_token,
|
api_token=self.llm_config.api_token,
|
||||||
base_url=self.llm_config.base_url,
|
base_url=self.llm_config.base_url,
|
||||||
json_response=True,
|
json_response=True,
|
||||||
|
base_delay=self.llm_config.backoff_base_delay,
|
||||||
|
max_attempts=self.llm_config.backoff_max_attempts,
|
||||||
|
exponential_factor=self.llm_config.backoff_exponential_factor,
|
||||||
extra_args=self.extra_args
|
extra_args=self.extra_args
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1745,6 +1745,9 @@ def perform_completion_with_backoff(
|
|||||||
api_token,
|
api_token,
|
||||||
json_response=False,
|
json_response=False,
|
||||||
base_url=None,
|
base_url=None,
|
||||||
|
base_delay=2,
|
||||||
|
max_attempts=3,
|
||||||
|
exponential_factor=2,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -1761,6 +1764,9 @@ def perform_completion_with_backoff(
|
|||||||
api_token (str): The API token for authentication.
|
api_token (str): The API token for authentication.
|
||||||
json_response (bool): Whether to request a JSON response. Defaults to False.
|
json_response (bool): Whether to request a JSON response. Defaults to False.
|
||||||
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
||||||
|
base_delay (int): The base delay in seconds. Defaults to 2.
|
||||||
|
max_attempts (int): The maximum number of attempts. Defaults to 3.
|
||||||
|
exponential_factor (int): The exponential factor. Defaults to 2.
|
||||||
**kwargs: Additional arguments for the API request.
|
**kwargs: Additional arguments for the API request.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -1770,9 +1776,6 @@ def perform_completion_with_backoff(
|
|||||||
from litellm import completion
|
from litellm import completion
|
||||||
from litellm.exceptions import RateLimitError
|
from litellm.exceptions import RateLimitError
|
||||||
|
|
||||||
max_attempts = 3
|
|
||||||
base_delay = 2 # Base delay in seconds, you can adjust this based on your needs
|
|
||||||
|
|
||||||
extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
|
extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
|
||||||
if json_response:
|
if json_response:
|
||||||
extra_args["response_format"] = {"type": "json_object"}
|
extra_args["response_format"] = {"type": "json_object"}
|
||||||
@@ -1798,7 +1801,7 @@ def perform_completion_with_backoff(
|
|||||||
# Check if we have exhausted our max attempts
|
# Check if we have exhausted our max attempts
|
||||||
if attempt < max_attempts - 1:
|
if attempt < max_attempts - 1:
|
||||||
# Calculate the delay and wait
|
# Calculate the delay and wait
|
||||||
delay = base_delay * (2**attempt) # Exponential backoff formula
|
delay = base_delay * (exponential_factor**attempt) # Exponential backoff formula
|
||||||
print(f"Waiting for {delay} seconds before retrying...")
|
print(f"Waiting for {delay} seconds before retrying...")
|
||||||
time.sleep(delay)
|
time.sleep(delay)
|
||||||
else:
|
else:
|
||||||
@@ -1831,6 +1834,9 @@ async def aperform_completion_with_backoff(
|
|||||||
api_token,
|
api_token,
|
||||||
json_response=False,
|
json_response=False,
|
||||||
base_url=None,
|
base_url=None,
|
||||||
|
base_delay=2,
|
||||||
|
max_attempts=3,
|
||||||
|
exponential_factor=2,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -1847,6 +1853,9 @@ async def aperform_completion_with_backoff(
|
|||||||
api_token (str): The API token for authentication.
|
api_token (str): The API token for authentication.
|
||||||
json_response (bool): Whether to request a JSON response. Defaults to False.
|
json_response (bool): Whether to request a JSON response. Defaults to False.
|
||||||
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
||||||
|
base_delay (int): The base delay in seconds. Defaults to 2.
|
||||||
|
max_attempts (int): The maximum number of attempts. Defaults to 3.
|
||||||
|
exponential_factor (int): The exponential factor. Defaults to 2.
|
||||||
**kwargs: Additional arguments for the API request.
|
**kwargs: Additional arguments for the API request.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -1857,9 +1866,6 @@ async def aperform_completion_with_backoff(
|
|||||||
from litellm.exceptions import RateLimitError
|
from litellm.exceptions import RateLimitError
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
max_attempts = 3
|
|
||||||
base_delay = 2 # Base delay in seconds, you can adjust this based on your needs
|
|
||||||
|
|
||||||
extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
|
extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
|
||||||
if json_response:
|
if json_response:
|
||||||
extra_args["response_format"] = {"type": "json_object"}
|
extra_args["response_format"] = {"type": "json_object"}
|
||||||
@@ -1885,7 +1891,7 @@ async def aperform_completion_with_backoff(
|
|||||||
# Check if we have exhausted our max attempts
|
# Check if we have exhausted our max attempts
|
||||||
if attempt < max_attempts - 1:
|
if attempt < max_attempts - 1:
|
||||||
# Calculate the delay and wait
|
# Calculate the delay and wait
|
||||||
delay = base_delay * (2**attempt) # Exponential backoff formula
|
delay = base_delay * (exponential_factor**attempt) # Exponential backoff formula
|
||||||
print(f"Waiting for {delay} seconds before retrying...")
|
print(f"Waiting for {delay} seconds before retrying...")
|
||||||
await asyncio.sleep(delay)
|
await asyncio.sleep(delay)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -108,7 +108,10 @@ async def handle_llm_qa(
|
|||||||
prompt_with_variables=prompt,
|
prompt_with_variables=prompt,
|
||||||
api_token=get_llm_api_key(config), # Returns None to let litellm handle it
|
api_token=get_llm_api_key(config), # Returns None to let litellm handle it
|
||||||
temperature=get_llm_temperature(config),
|
temperature=get_llm_temperature(config),
|
||||||
base_url=get_llm_base_url(config)
|
base_url=get_llm_base_url(config),
|
||||||
|
base_delay=config["llm"].get("backoff_base_delay", 2),
|
||||||
|
max_attempts=config["llm"].get("backoff_max_attempts", 3),
|
||||||
|
exponential_factor=config["llm"].get("backoff_exponential_factor", 2)
|
||||||
)
|
)
|
||||||
|
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
|
|||||||
@@ -439,10 +439,19 @@ LLMConfig is useful to pass LLM provider config to strategies and functions that
|
|||||||
| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use.
|
| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use.
|
||||||
| **`api_token`** |1.Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables <br/> 2. API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` <br/> 3. Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"` | API token to use for the given provider
|
| **`api_token`** |1.Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables <br/> 2. API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` <br/> 3. Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"` | API token to use for the given provider
|
||||||
| **`base_url`** |Optional. Custom API endpoint | If your provider has a custom endpoint
|
| **`base_url`** |Optional. Custom API endpoint | If your provider has a custom endpoint
|
||||||
|
| **`backoff_base_delay`** |Optional. `int` *(default: `2`)* | Seconds to wait before the first retry when the provider throttles a request.
|
||||||
|
| **`backoff_max_attempts`** |Optional. `int` *(default: `3`)* | Total tries (initial call + retries) before surfacing an error.
|
||||||
|
| **`backoff_exponential_factor`** |Optional. `int` *(default: `2`)* | Multiplier that increases the wait time for each retry (`delay = base_delay * exponential_factor ** attempt`).
|
||||||
|
|
||||||
## 3.2 Example Usage
|
## 3.2 Example Usage
|
||||||
```python
|
```python
|
||||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
llm_config = LLMConfig(
|
||||||
|
provider="openai/gpt-4o-mini",
|
||||||
|
api_token=os.getenv("OPENAI_API_KEY"),
|
||||||
|
backoff_base_delay=1, # optional
|
||||||
|
backoff_max_attempts=5, # optional
|
||||||
|
backoff_exponential_factor=3, # optional
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
## 4. Putting It All Together
|
## 4. Putting It All Together
|
||||||
|
|||||||
@@ -1593,8 +1593,20 @@ The `clone()` method:
|
|||||||
- Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"`
|
- Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"`
|
||||||
3. **`base_url`**:
|
3. **`base_url`**:
|
||||||
- If your provider has a custom endpoint
|
- If your provider has a custom endpoint
|
||||||
|
|
||||||
|
4. **Backoff controls** *(optional)*:
|
||||||
|
- `backoff_base_delay` *(default `2` seconds)* – how long to pause before the first retry if the provider rate-limits you.
|
||||||
|
- `backoff_max_attempts` *(default `3`)* – total tries for the same prompt (initial call + retries).
|
||||||
|
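- `backoff_exponential_factor` *(default `2`)* – how quickly the pause grows between retries. With the default 2-second base delay, a factor of 2 yields waits of 2s → 4s → 8s as retries continue; the default of 3 attempts allows only the first two waits.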
|
||||||
|
- Because these plug into Crawl4AI’s retry helper, every LLM strategy automatically follows the pacing you define here.
|
||||||
```python
|
```python
|
||||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
llm_config = LLMConfig(
|
||||||
|
provider="openai/gpt-4o-mini",
|
||||||
|
api_token=os.getenv("OPENAI_API_KEY"),
|
||||||
|
backoff_base_delay=1, # optional
|
||||||
|
backoff_max_attempts=5, # optional
|
||||||
|
backoff_exponential_factor=3, # optional
|
||||||
|
)
|
||||||
```
|
```
|
||||||
## 4. Putting It All Together
|
## 4. Putting It All Together
|
||||||
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call's needs:
|
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call's needs:
|
||||||
|
|||||||
@@ -308,8 +308,20 @@ The `clone()` method:
|
|||||||
3.⠀**`base_url`**:
|
3.⠀**`base_url`**:
|
||||||
- If your provider has a custom endpoint
|
- If your provider has a custom endpoint
|
||||||
|
|
||||||
|
4.⠀**Retry/backoff controls** *(optional)*:
|
||||||
|
- `backoff_base_delay` *(default `2` seconds)* – base delay inserted before the first retry when the provider returns a rate-limit response.
|
||||||
|
- `backoff_max_attempts` *(default `3`)* – total number of attempts (initial call plus retries) before the request is surfaced as an error.
|
||||||
|
- `backoff_exponential_factor` *(default `2`)* – growth rate for the retry delay (`delay = base_delay * exponential_factor ** attempt`).
|
||||||
|
- These values are forwarded to the shared `perform_completion_with_backoff` helper, ensuring every strategy that consumes your `LLMConfig` honors the same throttling policy.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
llm_config = LLMConfig(
|
||||||
|
provider="openai/gpt-4o-mini",
|
||||||
|
api_token=os.getenv("OPENAI_API_KEY"),
|
||||||
|
backoff_base_delay=1, # optional
|
||||||
|
backoff_max_attempts=5, # optional
|
||||||
|
backoff_exponential_factor=3, # optional
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
## 4. Putting It All Together
|
## 4. Putting It All Together
|
||||||
|
|||||||
@@ -9,6 +9,21 @@ from crawl4ai import (
|
|||||||
RateLimiter,
|
RateLimiter,
|
||||||
CacheMode
|
CacheMode
|
||||||
)
|
)
|
||||||
|
from crawl4ai.extraction_strategy import ExtractionStrategy
|
||||||
|
|
||||||
|
class MockExtractionStrategy(ExtractionStrategy):
|
||||||
|
"""Mock extraction strategy for testing URL parameter handling"""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
self.run_calls = []
|
||||||
|
|
||||||
|
def extract(self, url: str, html: str, *args, **kwargs):
|
||||||
|
return [{"test": "data"}]
|
||||||
|
|
||||||
|
def run(self, url: str, sections: List[str], *args, **kwargs):
|
||||||
|
self.run_calls.append(url)
|
||||||
|
return super().run(url, sections, *args, **kwargs)
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("viewport", [
|
@pytest.mark.parametrize("viewport", [
|
||||||
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
|
|||||||
assert not result.success
|
assert not result.success
|
||||||
assert result.error_message is not None
|
assert result.error_message is not None
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_extraction_strategy_run_with_regular_url():
|
||||||
|
"""
|
||||||
|
Regression test for extraction_strategy.run URL parameter handling with regular URLs.
|
||||||
|
|
||||||
|
This test verifies that when is_raw_html=False (regular URL),
|
||||||
|
extraction_strategy.run is called with the actual URL.
|
||||||
|
"""
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
browser_type="chromium",
|
||||||
|
headless=True
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
mock_strategy = MockExtractionStrategy()
|
||||||
|
|
||||||
|
# Test regular URL (is_raw_html=False)
|
||||||
|
regular_url = "https://example.com"
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=regular_url,
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
page_timeout=30000,
|
||||||
|
extraction_strategy=mock_strategy,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
assert len(mock_strategy.run_calls) == 1
|
||||||
|
assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_extraction_strategy_run_with_raw_html():
|
||||||
|
"""
|
||||||
|
Regression test for extraction_strategy.run URL parameter handling with raw HTML.
|
||||||
|
|
||||||
|
This test verifies that when is_raw_html=True (URL starts with "raw:"),
|
||||||
|
extraction_strategy.run is called with "Raw HTML" instead of the actual URL.
|
||||||
|
"""
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
browser_type="chromium",
|
||||||
|
headless=True
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
mock_strategy = MockExtractionStrategy()
|
||||||
|
|
||||||
|
# Test raw HTML URL (is_raw_html=True automatically set)
|
||||||
|
raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=raw_html_url,
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
page_timeout=30000,
|
||||||
|
extraction_strategy=mock_strategy,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.success
|
||||||
|
assert len(mock_strategy.run_calls) == 1
|
||||||
|
assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(test_viewport_config((1024, 768)))
|
asyncio.run(test_viewport_config((1024, 768)))
|
||||||
asyncio.run(test_memory_management())
|
asyncio.run(test_memory_management())
|
||||||
asyncio.run(test_rate_limiting())
|
asyncio.run(test_rate_limiting())
|
||||||
asyncio.run(test_javascript_execution())
|
asyncio.run(test_javascript_execution())
|
||||||
|
asyncio.run(test_extraction_strategy_run_with_regular_url())
|
||||||
|
asyncio.run(test_extraction_strategy_run_with_raw_html())
|
||||||
|
|||||||
Reference in New Issue
Block a user