diff --git a/CHANGELOG.md b/CHANGELOG.md index 873af87f..197fa32b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,36 @@ # Changelog -## [v0.3.6] - 2024-10-12 +## [v0.3.6] - 2024-10-12 - Part 1 + +### 1. Improved Crawling Control +- **New Hook**: Added `before_retrieve_html` hook in `AsyncPlaywrightCrawlerStrategy`. +- **Delayed HTML Retrieval**: Introduced `delay_before_return_html` parameter to allow waiting before retrieving HTML content. + - Useful for pages with delayed content loading. +- **Flexible Timeout**: `smart_wait` function now uses `page_timeout` (default 60 seconds) instead of a fixed 30-second timeout. + - Provides better handling for slow-loading pages. + +### 2. Enhanced LLM Extraction Strategy +- **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter. +- **Custom Headers**: Users can now pass custom headers to the extraction strategy. + - Enables more flexibility when interacting with different LLM APIs. + +### 3. AsyncWebCrawler Improvements +- **Flexible Initialization**: `AsyncWebCrawler` now accepts arbitrary keyword arguments. + - These are passed directly to the crawler strategy, allowing for more customized setups. + +### 4. Utility Function Enhancements +- **Improved API Interaction**: `perform_completion_with_backoff` function now supports additional arguments. + - Allows for more customized API calls to LLM providers. + +## Examples and Documentation +- Updated `quickstart_async.py` with examples of using custom headers in LLM extraction. +- Added more diverse examples of LLM provider usage, including OpenAI, Hugging Face, and Ollama. + +## Developer Notes +- Refactored code for better maintainability and flexibility. +- Enhanced error handling and logging for improved debugging experience. + +## [v0.3.6] - 2024-10-12 - Part 2 ### 1. Screenshot Capture - **What's new**: Added ability to capture screenshots during crawling. 
@@ -45,7 +75,6 @@ - Added examples for using different LLM providers in `quickstart_async.py`. - Enhanced type hinting throughout the codebase for better development experience. -We're constantly working to improve crawl4ai. These updates aim to provide you with more control, flexibility, and reliability in your web crawling tasks. As always, we appreciate your feedback and suggestions for future improvements! ## [v0.3.5] - 2024-09-02 diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 28795a3e..c74aff13 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -63,7 +63,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'on_execution_started': None, 'before_goto': None, 'after_goto': None, - 'before_return_html': None + 'before_return_html': None, + 'before_retrieve_html': None } async def __aenter__(self): @@ -295,7 +296,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): wait_for = kwargs.get("wait_for") if wait_for: try: - await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000)) + await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") @@ -304,8 +305,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("screenshot"): screenshot_data = await self.take_screenshot(url) + await self.execute_hook('before_retrieve_html', page) + # Check if delay_before_return_html is set then wait for that time + delay_before_return_html = kwargs.get("delay_before_return_html") + if delay_before_return_html: + await asyncio.sleep(delay_before_return_html) html = await page.content() - page = await self.execute_hook('before_return_html', page, html) + await self.execute_hook('before_return_html', page, html) if self.verbose: print(f"[LOG] ✅ Crawled {url} successfully!") diff --git a/crawl4ai/async_webcrawler.py 
b/crawl4ai/async_webcrawler.py index d308e930..ba82d28f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -23,17 +23,17 @@ class AsyncWebCrawler: self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, always_by_pass_cache: bool = False, - verbose: bool = False, + **kwargs, ): self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( - verbose=verbose + **kwargs ) self.always_by_pass_cache = always_by_pass_cache self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) self.ready = False - self.verbose = verbose + self.verbose = kwargs.get("verbose", False) async def __aenter__(self): await self.crawler_strategy.__aenter__() diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 48491067..210a360b 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy): self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) self.apply_chunking = kwargs.get("apply_chunking", True) self.base_url = kwargs.get("base_url", None) + self.extra_args = kwargs.get("extra_args", {}) if not self.apply_chunking: self.chunk_token_threshold = 1e9 @@ -111,7 +112,13 @@ class LLMExtractionStrategy(ExtractionStrategy): "{" + variable + "}", variable_values[variable] ) - response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema") + response = perform_completion_with_backoff( + self.provider, + prompt_with_variables, + self.api_token, + base_url=self.base_url, + extra_args = self.extra_args + ) # , json_response=self.extract_type == "schema") try: blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] blocks = json.loads(blocks) diff --git a/crawl4ai/utils.py 
b/crawl4ai/utils.py index 71a36aed..77671a20 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -775,7 +775,14 @@ def extract_xml_data(tags, string): return data # Function to perform the completion with exponential backoff -def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None): +def perform_completion_with_backoff( + provider, + prompt_with_variables, + api_token, + json_response = False, + base_url=None, + **kwargs + ): from litellm import completion from litellm.exceptions import RateLimitError max_attempts = 3 @@ -784,6 +791,9 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token, extra_args = {} if json_response: extra_args["response_format"] = { "type": "json_object" } + + if kwargs.get("extra_args"): + extra_args.update(kwargs["extra_args"]) for attempt in range(max_attempts): try: diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 836bdb1d..9f00e323 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -96,13 +96,17 @@ class OpenAIModelFee(BaseModel): ..., description="Fee for output token for the OpenAI model." ) -async def extract_structured_data_using_llm(provider: str, api_token: str = None): +async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None): print(f"\n--- Extracting Structured Data with {provider} ---") if api_token is None and provider != "ollama": print(f"API token is required for {provider}. 
Skipping this example.") return + extra_args = {} + if extra_headers: + extra_args["extra_headers"] = extra_headers + async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url="https://openai.com/api/pricing/", @@ -115,6 +119,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. Do not miss any models in the entire content. One extracted model JSON format should look like this: {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""", + extra_args=extra_args ), bypass_cache=True, ) @@ -414,9 +419,16 @@ async def main(): # LLM extraction examples await extract_structured_data_using_llm() - await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("ollama/llama3.2") + + # You can always pass custom headers to the extraction strategy + custom_headers = { + "Authorization": "Bearer your-custom-token", + "X-Custom-Header": "Some-Value" + } + await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"), extra_headers=custom_headers) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2()