feat: Enhance crawling control and LLM extraction flexibility

- Add before_retrieve_html hook and delay_before_return_html option
- Implement flexible page_timeout for smart_wait function
- Support extra_args and custom headers in LLM extraction
- Allow arbitrary kwargs in AsyncWebCrawler initialization
- Improve perform_completion_with_backoff for custom API calls
- Update examples with new features and diverse LLM providers
unclecode
2024-10-12 14:48:22 +08:00
parent 9b2b267820
commit 68e9144ce3
6 changed files with 76 additions and 12 deletions


@@ -1,6 +1,36 @@
# Changelog
## [v0.3.6] - 2024-10-12 - Part 1
### 1. Improved Crawling Control
- **New Hook**: Added a `before_retrieve_html` hook to `AsyncPlaywrightCrawlerStrategy`.
- **Delayed HTML Retrieval**: Introduced the `delay_before_return_html` parameter to wait a given number of seconds before retrieving the HTML content.
  - Useful for pages with delayed content loading.
- **Flexible Timeout**: The `smart_wait` function now uses `page_timeout` (default 60 seconds) instead of a fixed 30-second timeout.
  - Provides better handling for slow-loading pages; see the usage sketch below.
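The three options combine naturally in a single crawl. A minimal sketch, assuming `set_hook` accepts the new hook name and that the hook receives the Playwright page object (the URL and `wait_for` selector are placeholders):

```python
import asyncio

from crawl4ai import AsyncWebCrawler
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def on_before_retrieve_html(page):
    # Runs just before the HTML is read; `page` is the Playwright page object.
    print(f"[HOOK] about to retrieve HTML from {page.url}")

async def main():
    strategy = AsyncPlaywrightCrawlerStrategy(verbose=True)
    strategy.set_hook("before_retrieve_html", on_before_retrieve_html)

    async with AsyncWebCrawler(crawler_strategy=strategy) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            wait_for=".content",            # forwarded to smart_wait
            page_timeout=60000,             # smart_wait budget in milliseconds
            delay_before_return_html=2.0,   # extra wait (seconds) before reading HTML
            bypass_cache=True,
        )
        print(f"Got {len(result.html)} bytes of HTML")

asyncio.run(main())
```
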

### 2. Enhanced LLM Extraction Strategy
- **Custom Arguments**: Added support for passing extra arguments to LLM providers via the `extra_args` parameter.
- **Custom Headers**: Users can now pass custom headers to the extraction strategy.
  - Enables more flexibility when interacting with different LLM APIs; a sketch follows below.
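Both knobs travel through the same `extra_args` dictionary, whose keys are forwarded to the underlying `litellm.completion` call. A minimal sketch (the provider string, token, and header values are placeholders):

```python
from crawl4ai.extraction_strategy import LLMExtractionStrategy

strategy = LLMExtractionStrategy(
    provider="openai/gpt-4o-mini",      # any litellm-style provider string
    api_token="sk-your-key",
    instruction="Extract all model names with their input/output token fees.",
    extra_args={
        "temperature": 0.0,             # forwarded to the LLM provider
        "max_tokens": 2000,
        "extra_headers": {              # custom HTTP headers for the API call
            "X-Custom-Header": "Some-Value",
        },
    },
)
```
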

### 3. AsyncWebCrawler Improvements
- **Flexible Initialization**: `AsyncWebCrawler` now accepts arbitrary keyword arguments.
  - These are passed directly to the crawler strategy, allowing for more customized setups (see the sketch below).
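Anything the crawler does not consume itself is handed to the default `AsyncPlaywrightCrawlerStrategy`. A minimal sketch (`headless` is assumed to be a strategy option and is shown only to illustrate the pass-through):

```python
import asyncio

from crawl4ai import AsyncWebCrawler

async def main():
    # verbose is read by AsyncWebCrawler itself via kwargs.get("verbose", False);
    # the full **kwargs dict is also forwarded to the crawler strategy.
    async with AsyncWebCrawler(verbose=True, headless=False) as crawler:
        result = await crawler.arun(url="https://example.com", bypass_cache=True)
        print(result.markdown[:200])

asyncio.run(main())
```
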

### 4. Utility Function Enhancements
- **Improved API Interaction**: The `perform_completion_with_backoff` function now accepts additional keyword arguments.
  - Allows for more customized API calls to LLM providers, as sketched below.
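Internally, an `extra_args` dictionary passed through `**kwargs` is merged into the arguments handed to `litellm.completion`. A minimal sketch (the module path, provider string, and token are assumptions for illustration):

```python
from crawl4ai.utils import perform_completion_with_backoff

response = perform_completion_with_backoff(
    provider="openai/gpt-4o-mini",
    prompt_with_variables="Summarize the following text: ...",
    api_token="sk-your-key",
    extra_args={
        "temperature": 0.1,
        "extra_headers": {"X-Trace-Id": "abc123"},
    },
)
print(response.choices[0].message.content)
```
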
## Examples and Documentation
- Updated `quickstart_async.py` with examples of using custom headers in LLM extraction.
- Added more diverse examples of LLM provider usage, including OpenAI, Hugging Face, and Ollama.
## Developer Notes
- Refactored code for better maintainability and flexibility.
- Enhanced error handling and logging for improved debugging experience.
## [v0.3.6] - 2024-10-12 - Part 2
### 1. Screenshot Capture
- **What's new**: Added the ability to capture screenshots during crawling; a short sketch follows below.
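A short sketch of the screenshot feature. It assumes the result object exposes the captured image as base64 in `result.screenshot`, which is not shown in the diff below:

```python
import asyncio
import base64

from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://example.com", screenshot=True)
        # result.screenshot is assumed to hold base64-encoded PNG data
        with open("example.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))

asyncio.run(main())
```
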
@@ -45,7 +75,6 @@
- Added examples for using different LLM providers in `quickstart_async.py`.
- Enhanced type hinting throughout the codebase for better development experience.
We're constantly working to improve crawl4ai. These updates aim to provide you with more control, flexibility, and reliability in your web crawling tasks. As always, we appreciate your feedback and suggestions for future improvements!
## [v0.3.5] - 2024-09-02


@@ -63,7 +63,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             'on_execution_started': None,
             'before_goto': None,
             'after_goto': None,
-            'before_return_html': None
+            'before_return_html': None,
+            'before_retrieve_html': None
         }

     async def __aenter__(self):
@@ -295,7 +296,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         wait_for = kwargs.get("wait_for")
         if wait_for:
             try:
-                await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
+                await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000))
             except Exception as e:
                 raise RuntimeError(f"Wait condition failed: {str(e)}")
@@ -304,8 +305,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if kwargs.get("screenshot"):
screenshot_data = await self.take_screenshot(url)
await self.execute_hook('before_retrieve_html', page)
# Check if delay_before_return_html is set then wait for that time
delay_before_return_html = kwargs.get("delay_before_return_html")
if delay_before_return_html:
await asyncio.sleep(delay_before_return_html)
html = await page.content()
page = await self.execute_hook('before_return_html', page, html)
await self.execute_hook('before_return_html', page, html)
if self.verbose:
print(f"[LOG] ✅ Crawled {url} successfully!")


@@ -23,17 +23,17 @@ class AsyncWebCrawler:
         self,
         crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
         always_by_pass_cache: bool = False,
-        verbose: bool = False,
+        **kwargs,
     ):
         self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
-            verbose=verbose
+            **kwargs
         )
         self.always_by_pass_cache = always_by_pass_cache
         self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
         self.ready = False
-        self.verbose = verbose
+        self.verbose = kwargs.get("verbose", False)

     async def __aenter__(self):
         await self.crawler_strategy.__aenter__()


@@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
         self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
         self.apply_chunking = kwargs.get("apply_chunking", True)
         self.base_url = kwargs.get("base_url", None)
+        self.extra_args = kwargs.get("extra_args", {})

         if not self.apply_chunking:
             self.chunk_token_threshold = 1e9
@@ -111,7 +112,13 @@ class LLMExtractionStrategy(ExtractionStrategy):
"{" + variable + "}", variable_values[variable]
)
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema")
response = perform_completion_with_backoff(
self.provider,
prompt_with_variables,
self.api_token,
base_url=self.base_url,
extra_args = self.extra_args
) # , json_response=self.extract_type == "schema")
try:
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
blocks = json.loads(blocks)


@@ -775,7 +775,14 @@ def extract_xml_data(tags, string):
     return data

 # Function to perform the completion with exponential backoff
-def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None):
+def perform_completion_with_backoff(
+    provider,
+    prompt_with_variables,
+    api_token,
+    json_response=False,
+    base_url=None,
+    **kwargs
+):
     from litellm import completion
     from litellm.exceptions import RateLimitError
     max_attempts = 3
@@ -784,6 +791,9 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
     extra_args = {}
     if json_response:
         extra_args["response_format"] = { "type": "json_object" }
+    if kwargs.get("extra_args"):
+        extra_args.update(kwargs["extra_args"])

     for attempt in range(max_attempts):
         try:


@@ -96,13 +96,17 @@ class OpenAIModelFee(BaseModel):
..., description="Fee for output token for the OpenAI model."
)
async def extract_structured_data_using_llm(provider: str, api_token: str = None):
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
print(f"\n--- Extracting Structured Data with {provider} ---")
if api_token is None and provider != "ollama":
print(f"API token is required for {provider}. Skipping this example.")
return
extra_args = {}
if extra_headers:
extra_args["extra_headers"] = extra_headers
async with AsyncWebCrawler(verbose=True) as crawler:
result = await crawler.arun(
url="https://openai.com/api/pricing/",
@@ -115,6 +119,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content. One extracted model JSON format should look like this:
{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
extra_args=extra_args
),
bypass_cache=True,
)
@@ -414,9 +419,16 @@ async def main():
     # LLM extraction examples
     await extract_structured_data_using_llm()
-    await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
+    await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("ollama/llama3.2")
+
+    # You can always pass custom headers to the extraction strategy
+    custom_headers = {
+        "Authorization": "Bearer your-custom-token",
+        "X-Custom-Header": "Some-Value"
+    }
+    await extract_structured_data_using_llm(extra_headers=custom_headers)

     # await crawl_dynamic_content_pages_method_1()
     # await crawl_dynamic_content_pages_method_2()