feat: Enhance crawling control and LLM extraction flexibility

- Add before_retrieve_html hook and delay_before_return_html option
- Implement flexible page_timeout for smart_wait function
- Support extra_args and custom headers in LLM extraction
- Allow arbitrary kwargs in AsyncWebCrawler initialization
- Improve perform_completion_with_backoff for custom API calls
- Update examples with new features and diverse LLM providers
This commit is contained in:
unclecode
2024-10-12 14:48:22 +08:00
parent 9b2b267820
commit 68e9144ce3
6 changed files with 76 additions and 12 deletions

View File

@@ -63,7 +63,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
'on_execution_started': None,
'before_goto': None,
'after_goto': None,
'before_return_html': None
'before_return_html': None,
'before_retrieve_html': None
}
async def __aenter__(self):
@@ -295,7 +296,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
wait_for = kwargs.get("wait_for")
if wait_for:
try:
await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000))
except Exception as e:
raise RuntimeError(f"Wait condition failed: {str(e)}")
@@ -304,8 +305,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if kwargs.get("screenshot"):
screenshot_data = await self.take_screenshot(url)
await self.execute_hook('before_retrieve_html', page)
# Check if delay_before_return_html is set then wait for that time
delay_before_return_html = kwargs.get("delay_before_return_html")
if delay_before_return_html:
await asyncio.sleep(delay_before_return_html)
html = await page.content()
page = await self.execute_hook('before_return_html', page, html)
await self.execute_hook('before_return_html', page, html)
if self.verbose:
print(f"[LOG] ✅ Crawled {url} successfully!")

View File

@@ -23,17 +23,17 @@ class AsyncWebCrawler:
self,
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
always_by_pass_cache: bool = False,
verbose: bool = False,
**kwargs,
):
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
verbose=verbose
**kwargs
)
self.always_by_pass_cache = always_by_pass_cache
self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(self.crawl4ai_folder, exist_ok=True)
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
self.ready = False
self.verbose = verbose
self.verbose = kwargs.get("verbose", False)
async def __aenter__(self):
await self.crawler_strategy.__aenter__()

View File

@@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
self.apply_chunking = kwargs.get("apply_chunking", True)
self.base_url = kwargs.get("base_url", None)
self.extra_args = kwargs.get("extra_args", {})
if not self.apply_chunking:
self.chunk_token_threshold = 1e9
@@ -111,7 +112,13 @@ class LLMExtractionStrategy(ExtractionStrategy):
"{" + variable + "}", variable_values[variable]
)
response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema")
response = perform_completion_with_backoff(
self.provider,
prompt_with_variables,
self.api_token,
base_url=self.base_url,
extra_args = self.extra_args
) # , json_response=self.extract_type == "schema")
try:
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
blocks = json.loads(blocks)

View File

@@ -775,7 +775,14 @@ def extract_xml_data(tags, string):
return data
# Function to perform the completion with exponential backoff
def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None):
def perform_completion_with_backoff(
provider,
prompt_with_variables,
api_token,
json_response = False,
base_url=None,
**kwargs
):
from litellm import completion
from litellm.exceptions import RateLimitError
max_attempts = 3
@@ -784,6 +791,9 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
extra_args = {}
if json_response:
extra_args["response_format"] = { "type": "json_object" }
if kwargs.get("extra_args"):
extra_args.update(kwargs["extra_args"])
for attempt in range(max_attempts):
try: