diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 3278c731..ffc7626f 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -24,7 +24,7 @@
 from .browser_manager import BrowserManager
 import aiofiles
 import aiohttp
-import cchardet
+import chardet
 from aiohttp.client import ClientTimeout
 from urllib.parse import urlparse
 from types import MappingProxyType
@@ -130,6 +130,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         Close the browser and clean up resources.
         """
         await self.browser_manager.close()
+        # Explicitly reset the static Playwright instance
+        BrowserManager._playwright_instance = None

     async def kill_session(self, session_id: str):
         """
@@ -679,14 +681,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            if console_log_type == "error":
                self.logger.error(
                    message=f"Console error: {msg}",  # Use f-string for variable interpolation
-                    tag="CONSOLE",
-                    params={"msg": msg.text},
+                    tag="CONSOLE"
                )
            elif console_log_type == "debug":
                self.logger.debug(
                    message=f"Console: {msg}",  # Use f-string for variable interpolation
-                    tag="CONSOLE",
-                    params={"msg": msg.text},
+                    tag="CONSOLE"
                )

        page.on("console", log_consol)
@@ -967,7 +967,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

        for selector in selectors:
            try:
-                content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''")
+                content = await page.evaluate(
+                    f"""Array.from(document.querySelectorAll("{selector}"))
+                        .map(el => el.outerHTML)
+                        .join('')"""
+                )
                html_parts.append(content)
            except Error as e:
                print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
@@ -1975,7 +1979,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
            await self.start()
            yield self._session
        finally:
-            await self.close()
+            pass

    def set_hook(self, hook_type: str, hook_func: Callable) -> None:
        if hook_type in self.hooks:
@@ -2091,7 +2095,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):

            encoding = response.charset
            if not encoding:
-                encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8'
+                encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'

            result = AsyncCrawlResponse(
                html=content.tobytes().decode(encoding, errors='replace'),
diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py
index 541f755a..76a1a8e7 100644
--- a/crawl4ai/async_logger.py
+++ b/crawl4ai/async_logger.py
@@ -4,6 +4,7 @@ from typing import Optional, Dict, Any
 from colorama import Fore, Style, init
 import os
 from datetime import datetime
+from urllib.parse import unquote


 class LogLevel(Enum):
@@ -44,11 +45,11 @@ class AsyncLoggerBase(ABC):
        pass

    @abstractmethod
-    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
        pass

    @abstractmethod
-    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
        pass

 class AsyncLogger(AsyncLoggerBase):
@@ -130,6 +131,14 @@ class AsyncLogger(AsyncLoggerBase):
    def _get_icon(self, tag: str) -> str:
        """Get the icon for a tag, defaulting to info icon if not found."""
        return self.icons.get(tag, self.icons["INFO"])
+
+    def _shorten(self, text, length, placeholder="..."):
+        """Truncate text in the middle if longer than length, or pad if shorter."""
+        if len(text) <= length:
+            return text.ljust(length)  # Pad with spaces to reach desired length
+        half = (length - len(placeholder)) // 2
+        shortened = text[:half] + placeholder + text[-half:]
+        return shortened.ljust(length)  # Also pad shortened text to consistent length

    def _write_to_file(self, message: str):
        """Write a message to the log file if configured."""
@@ -259,7 +268,7 @@ class AsyncLogger(AsyncLoggerBase):
        success: bool,
        timing: float,
        tag: str = "FETCH",
-        url_length: int = 50,
+        url_length: int = 100,
    ):
        """
        Convenience method for logging URL fetch status.
@@ -271,14 +280,15 @@ class AsyncLogger(AsyncLoggerBase):
            tag: Tag for the message
            url_length: Maximum length for URL in log
        """
+        decoded_url = unquote(url)
+        readable_url = self._shorten(decoded_url, url_length)
        self._log(
            level=LogLevel.SUCCESS if success else LogLevel.ERROR,
-            message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
+            message="{url} | {status} | ⏱: {timing:.2f}s",
            tag=tag,
            params={
-                "url": url,
-                "url_length": url_length,
-                "status": success,
+                "url": readable_url,
+                "status": "✓" if success else "✗",
                "timing": timing,
            },
            colors={
@@ -299,11 +309,13 @@ class AsyncLogger(AsyncLoggerBase):
            tag: Tag for the message
            url_length: Maximum length for URL in log
        """
+        decoded_url = unquote(url)
+        readable_url = self._shorten(decoded_url, url_length)
        self._log(
            level=LogLevel.ERROR,
-            message="{url:.{url_length}}... | Error: {error}",
+            message="{url} | Error: {error}",
            tag=tag,
-            params={"url": url, "url_length": url_length, "error": error},
+            params={"url": readable_url, "error": error},
        )

 class AsyncFileLogger(AsyncLoggerBase):
@@ -347,13 +359,13 @@ class AsyncFileLogger(AsyncLoggerBase):
        """Log an error message to file."""
        self._write_to_file("ERROR", message, tag)

-    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
        """Log URL fetch status to file."""
        status = "SUCCESS" if success else "FAILED"
        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
        self._write_to_file("URL_STATUS", message, tag)

-    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
        """Log error status to file."""
        message = f"{url[:url_length]}... | Error: {error}"
        self._write_to_file("ERROR", message, tag)
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 8940b8ab..98acfd12 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -358,10 +358,11 @@ class AsyncWebCrawler:
                        html=html,
                        extracted_content=extracted_content,
                        config=config,  # Pass the config object instead of individual parameters
-                        screenshot=screenshot_data,
+                        screenshot_data=screenshot_data,
                        pdf_data=pdf_data,
                        verbose=config.verbose,
                        is_raw_html=True if url.startswith("raw:") else False,
+                        redirected_url=async_response.redirected_url,
                        **kwargs,
                    )
@@ -380,18 +381,11 @@ class AsyncWebCrawler:
                    crawl_result.session_id = getattr(
                        config, "session_id", None)

-                    self.logger.success(
-                        message="{url:.50}... | Status: {status} | Total: {timing}",
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=crawl_result.success,
+                        timing=time.perf_counter() - start_time,
                        tag="COMPLETE",
-                        params={
-                            "url": cache_context.display_url,
-                            "status": crawl_result.success,
-                            "timing": f"{time.perf_counter() - start_time:.2f}s",
-                        },
-                        colors={
-                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
-                            "timing": Fore.YELLOW,
-                        },
                    )

                    # Update cache if appropriate
@@ -401,17 +395,12 @@ class AsyncWebCrawler:

                    return CrawlResultContainer(crawl_result)

                else:
-                    self.logger.success(
-                        message="{url:.50}... | Status: {status} | Total: {timing}",
-                        tag="COMPLETE",
-                        params={
-                            "url": cache_context.display_url,
-                            "status": True,
-                            "timing": f"{time.perf_counter() - start_time:.2f}s",
-                        },
-                        colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=True,
+                        timing=time.perf_counter() - start_time,
+                        tag="COMPLETE"
                    )
-
                    cached_result.success = bool(html)
                    cached_result.session_id = getattr(
                        config, "session_id", None)
@@ -446,7 +435,7 @@ class AsyncWebCrawler:
        html: str,
        extracted_content: str,
        config: CrawlerRunConfig,
-        screenshot: str,
+        screenshot_data: str,
        pdf_data: str,
        verbose: bool,
        **kwargs,
@@ -459,7 +448,7 @@ class AsyncWebCrawler:
            html: Raw HTML content
            extracted_content: Previously extracted content (if any)
            config: Configuration object controlling processing behavior
-            screenshot: Screenshot data (if any)
+            screenshot_data: Screenshot data (if any)
            pdf_data: PDF data (if any)
            verbose: Whether to enable verbose logging
            **kwargs: Additional parameters for backwards compatibility
@@ -564,20 +553,23 @@ class AsyncWebCrawler:
            markdown_result: MarkdownGenerationResult = (
                markdown_generator.generate_markdown(
                    input_html=markdown_input_html,
-                    base_url=url,
+                    base_url=params.get("redirected_url", url)
                    # html2text_options=kwargs.get('html2text', {})
                )
            )

            # Log processing completion
-            self.logger.info(
-                message="{url:.50}... | Time: {timing}s",
-                tag="SCRAPE",
-                params={
-                    "url": _url,
-                    "timing": int((time.perf_counter() - t1) * 1000) / 1000,
-                },
+            self.logger.url_status(
+                url=_url,
+                success=True,
+                timing=int((time.perf_counter() - t1) * 1000) / 1000,
+                tag="SCRAPE"
            )
+            # self.logger.info(
+            #     message="{url:.50}... | Time: {timing}s",
+            #     tag="SCRAPE",
+            #     params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
+            # )

            ################################
            # Structured Content Extraction #
@@ -624,10 +616,6 @@ class AsyncWebCrawler:
                params={"url": _url, "timing": time.perf_counter() - t1},
            )

-            # Handle screenshot and PDF data
-            screenshot_data = None if not screenshot else screenshot
-            pdf_data = None if not pdf_data else pdf_data
-
            # Apply HTML formatting if requested
            if config.prettiify:
                cleaned_html = fast_format_html(cleaned_html)
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index d6cf7b8c..1dfbce84 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -28,6 +28,7 @@ from lxml import etree
 from lxml import html as lhtml
 from typing import List
 from .models import ScrapingResult, MediaItem, Link, Media, Links
+import copy

 # Pre-compile regular expressions for Open Graph and Twitter metadata
 OG_REGEX = re.compile(r"^og:")
@@ -48,7 +49,7 @@ def parse_srcset(s: str) -> List[Dict]:
        if len(parts) >= 1:
            url = parts[0]
            width = (
-                parts[1].rstrip("w")
+                parts[1].rstrip("w").split('.')[0]
                if len(parts) > 1 and parts[1].endswith("w")
                else None
            )
@@ -128,7 +129,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        Returns:
            ScrapingResult: A structured result containing the scraped content.
        """
-        raw_result = self._scrap(url, html, is_async=False, **kwargs)
+        actual_url = kwargs.get("redirected_url", url)
+        raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
        if raw_result is None:
            return ScrapingResult(
                cleaned_html="",
@@ -619,6 +621,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            return False

        keep_element = False
+        # Special case for table elements - always preserve structure
+        if element.name in ["tr", "td", "th"]:
+            keep_element = True

        exclude_domains = kwargs.get("exclude_domains", [])
        # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
@@ -859,6 +864,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        parser_type = kwargs.get("parser", "lxml")
        soup = BeautifulSoup(html, parser_type)
        body = soup.body
+        if body is None:
+            raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
        base_domain = get_base_domain(url)

        # Early removal of all images if exclude_all_images is set
@@ -897,23 +904,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            for element in body.select(excluded_selector):
                element.extract()

-        # if False and css_selector:
-        #     selected_elements = body.select(css_selector)
-        #     if not selected_elements:
-        #         return {
-        #             "markdown": "",
-        #             "cleaned_html": "",
-        #             "success": True,
-        #             "media": {"images": [], "videos": [], "audios": []},
-        #             "links": {"internal": [], "external": []},
-        #             "metadata": {},
-        #             "message": f"No elements found for CSS selector: {css_selector}",
-        #         }
-        #         # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
-        #     body = soup.new_tag("div")
-        #     for el in selected_elements:
-        #         body.append(el)
-
        content_element = None
        if target_elements:
            try:
@@ -922,12 +912,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                for_content_targeted_element = []
                for target_element in target_elements:
                    for_content_targeted_element.extend(body.select(target_element))
                content_element = soup.new_tag("div")
                for el in for_content_targeted_element:
-                    content_element.append(el)
+                    content_element.append(copy.deepcopy(el))
            except Exception as e:
                self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
                return None
        else:
-            content_element = body
+            content_element = body

        kwargs["exclude_social_media_domains"] = set(
            kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
@@ -1308,6 +1298,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            "source",
            "track",
            "wbr",
+            "tr",
+            "td",
+            "th",
        }

        for el in reversed(list(root.iterdescendants())):
@@ -1540,26 +1533,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
            meta = {}

-        # Handle CSS selector targeting
-        # if css_selector:
-        #     try:
-        #         selected_elements = body.cssselect(css_selector)
-        #         if not selected_elements:
-        #             return {
-        #                 "markdown": "",
-        #                 "cleaned_html": "",
-        #                 "success": True,
-        #                 "media": {"images": [], "videos": [], "audios": []},
-        #                 "links": {"internal": [], "external": []},
-        #                 "metadata": meta,
-        #                 "message": f"No elements found for CSS selector: {css_selector}",
-        #             }
-        #         body = lhtml.Element("div")
-        #         body.extend(selected_elements)
-        #     except Exception as e:
-        #         self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
-        #         return None
-
        content_element = None
        if target_elements:
            try:
@@ -1567,7 +1540,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                for_content_targeted_element = []
                for target_element in target_elements:
                    for_content_targeted_element.extend(body.cssselect(target_element))
                content_element = lhtml.Element("div")
-                content_element.extend(for_content_targeted_element)
+                content_element.extend(copy.deepcopy(for_content_targeted_element))
            except Exception as e:
                self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
                return None
@@ -1636,7 +1609,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):

        # Remove empty elements
        self.remove_empty_elements_fast(body, 1)

-        # Remvoe unneeded attributes
+        # Remove unneeded attributes
        self.remove_unwanted_attributes_fast(
            body, keep_data_attributes=kwargs.get("keep_data_attributes", False)
        )
diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py
index 4811ba14..65d4e819 100644
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -11,6 +11,7 @@
 from .scorers import URLScorer
 from . import DeepCrawlStrategy
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
+from ..utils import normalize_url_for_deep_crawl
 from math import inf as infinity

@@ -106,13 +107,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
            valid_links = []
            for link in links:
                url = link.get("href")
-                if url in visited:
+                base_url = normalize_url_for_deep_crawl(url, source_url)
+                if base_url in visited:
                    continue
                if not await self.can_process_url(url, new_depth):
                    self.stats.urls_skipped += 1
                    continue

-                valid_links.append(url)
+                valid_links.append(base_url)

            # If we have more valid links than capacity, limit them
            if len(valid_links) > remaining_capacity:
diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py
index 54b72ea3..48c116dd 100644
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -117,7 +117,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                    self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
                    self.stats.urls_skipped += 1
                    continue
-
+
+                visited.add(base_url)
                valid_links.append((base_url, score))

            # If we have more valid links than capacity, sort by score and take the top ones
@@ -158,7 +159,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
        while current_level and not self._cancel_event.is_set():
            next_level: List[Tuple[str, Optional[str]]] = []
            urls = [url for url, _ in current_level]
-            visited.update(urls)

            # Clone the config to disable deep crawling recursion and enforce batch mode.
            batch_config = config.clone(deep_crawl_strategy=None, stream=False)
diff --git a/crawl4ai/js_snippet/remove_overlay_elements.js b/crawl4ai/js_snippet/remove_overlay_elements.js
index 0400d89c..a50d9427 100644
--- a/crawl4ai/js_snippet/remove_overlay_elements.js
+++ b/crawl4ai/js_snippet/remove_overlay_elements.js
@@ -115,5 +115,6 @@ async () => {
  document.body.style.overflow = "auto";

  // Wait a bit for any animations to complete
-  await new Promise((resolve) => setTimeout(resolve, 100));
+  document.body.scrollIntoView(false);
+  await new Promise((resolve) => setTimeout(resolve, 50));
 };
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 02d105a9..67b61002 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2003,6 +2003,10 @@ def normalize_url(href, base_url):
    if not parsed_base.scheme or not parsed_base.netloc:
        raise ValueError(f"Invalid base URL format: {base_url}")

+    # Ensure base_url ends with a trailing slash if it's a directory path
+    if not base_url.endswith('/'):
+        base_url = base_url + '/'
+
    # Use urljoin to handle all cases
    normalized = urljoin(base_url, href.strip())
    return normalized
@@ -2047,7 +2051,7 @@ def normalize_url_for_deep_crawl(href, base_url):
    normalized = urlunparse((
        parsed.scheme,
        netloc,
-        parsed.path.rstrip('/') or '/',  # Normalize trailing slash
+        parsed.path.rstrip('/'),  # Normalize trailing slash
        parsed.params,
        query,
        fragment
@@ -2075,7 +2079,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
    normalized = urlunparse((
        parsed.scheme,
        parsed.netloc.lower(),
-        parsed.path,
+        parsed.path.rstrip('/'),
        parsed.params,
        parsed.query,
        ''  # Remove fragment
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 130b57d0..032ea45c 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -60,6 +60,8 @@ async def handle_llm_qa(
 ) -> str:
    """Process QA using LLM with crawled content as context."""
    try:
+        if not url.startswith(('http://', 'https://')):
+            url = 'https://' + url
        # Extract base URL by finding last '?q=' occurrence
        last_q_index = url.rfind('?q=')
        if last_q_index != -1:
@@ -73,7 +75,7 @@ async def handle_llm_qa(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=result.error_message
        )
-    content = result.markdown.fit_markdown
+    content = result.markdown.fit_markdown or result.markdown.raw_markdown

    # Create prompt and get LLM response
    prompt = f"""Use the following content as context to answer the question.
@@ -397,6 +399,7 @@ async def handle_crawl_request(
    peak_mem_mb = start_mem_mb

    try:
+        urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls]
        browser_config = BrowserConfig.load(browser_config)
        crawler_config = CrawlerRunConfig.load(crawler_config)

diff --git a/deploy/docker/server.py b/deploy/docker/server.py
index 7c02a74f..3cad8d05 100644
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -432,7 +432,7 @@ async def execute_js(
 async def llm_endpoint(
    request: Request,
    url: str = Path(...),
-    q: Optional[str] = Query(None),
+    q: str = Query(...),
    _td: Dict = Depends(token_dep),
 ):
    if not q:
diff --git a/docs/examples/full_page_screenshot_and_pdf_export.md b/docs/examples/full_page_screenshot_and_pdf_export.md
index 8522675c..bf11f8db 100644
--- a/docs/examples/full_page_screenshot_and_pdf_export.md
+++ b/docs/examples/full_page_screenshot_and_pdf_export.md
@@ -12,9 +12,10 @@ We’ve introduced a new feature that effortlessly handles even the biggest page

 **Simple Example:**

 ```python
-import os, sys
+import os
+import sys
 import asyncio
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

 # Adjust paths as needed
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -26,9 +27,11 @@ async def main():
         # Request both PDF and screenshot
         result = await crawler.arun(
             url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
-            cache_mode=CacheMode.BYPASS,
-            pdf=True,
-            screenshot=True
+            config=CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                pdf=True,
+                screenshot=True
+            )
         )

         if result.success:
@@ -40,9 +43,8 @@ async def main():

             # Save PDF
             if result.pdf:
-                pdf_bytes = b64decode(result.pdf)
                 with open(os.path.join(__location__, "page.pdf"), "wb") as f:
-                    f.write(pdf_bytes)
+                    f.write(result.pdf)

 if __name__ == "__main__":
     asyncio.run(main())
diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md
index 6cf771c1..c7ac21ae 100644
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -232,6 +232,7 @@ async def main():

 if __name__ == "__main__":
     asyncio.run(main())
+```

 ## 2.4 Compliance & Ethics

diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md
index b8817c6f..9ea8f2a1 100644
--- a/docs/md_v2/core/browser-crawler-config.md
+++ b/docs/md_v2/core/browser-crawler-config.md
@@ -36,8 +36,6 @@ class BrowserConfig:

 ### Key Fields to Note

-
-
 1. **`browser_type`**
 - Options: `"chromium"`, `"firefox"`, or `"webkit"`.
 - Defaults to `"chromium"`.
@@ -215,6 +213,7 @@ class CrawlerRunConfig:
 - The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
 - Affects how much information is printed during the crawl.

+
 ### Helper Methods

 The `clone()` method is particularly useful for creating variations of your crawler configuration:
@@ -248,9 +247,6 @@ The `clone()` method:

 ---

-
-
-
 ## 3. LLMConfig Essentials

 ### Key fields to note
diff --git a/docs/md_v2/extraction/llm-strategies.md b/docs/md_v2/extraction/llm-strategies.md
index 785ff9b5..9f6a6b3e 100644
--- a/docs/md_v2/extraction/llm-strategies.md
+++ b/docs/md_v2/extraction/llm-strategies.md
@@ -2,7 +2,7 @@

 In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that:

-1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more).
+1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more).
 2. Automatically splits content into chunks (if desired) to handle token limits, then combines results.
 3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach.

@@ -18,13 +18,19 @@ In some cases, you need to extract **complex or unstructured** information from

 ---

-## 2. Provider-Agnostic via LightLLM
+## 2. Provider-Agnostic via LiteLLM

-Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide:
+You can use LlmConfig, to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about LlmConfig [here](/api/parameters).
+
+```python
+llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+```
+
+Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:

 - **`provider`**: The `