feat: Enhance crawling control and LLM extraction flexibility

- Add before_retrieve_html hook and delay_before_return_html option
- Implement flexible page_timeout for smart_wait function
- Support extra_args and custom headers in LLM extraction
- Allow arbitrary kwargs in AsyncWebCrawler initialization
- Improve perform_completion_with_backoff for custom API calls
- Update examples with new features and diverse LLM providers
This commit is contained in:
unclecode
2024-10-12 14:48:22 +08:00
parent 9b2b267820
commit 68e9144ce3
6 changed files with 76 additions and 12 deletions

View File

@@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
self.apply_chunking = kwargs.get("apply_chunking", True)
self.base_url = kwargs.get("base_url", None)
self.extra_args = kwargs.get("extra_args", {})
if not self.apply_chunking:
self.chunk_token_threshold = 1e9
@@ -111,7 +112,13 @@ class LLMExtractionStrategy(ExtractionStrategy):
"{" + variable + "}", variable_values[variable]
)
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema")
response = perform_completion_with_backoff(
self.provider,
prompt_with_variables,
self.api_token,
base_url=self.base_url,
extra_args = self.extra_args
) # , json_response=self.extract_type == "schema")
try:
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
blocks = json.loads(blocks)