Merge PR #899 into next, resolve conflicts in server.py and docs/browser-crawler-config.md

2025-04-22 14:56:47 +08:00
parent 0007aea204 b27bb367e8
commit f3ebb38edf
16 changed files with 132 additions and 140 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -24,7 +24,7 @@ from .browser_manager import BrowserManager

 import aiofiles
 import aiohttp
-import cchardet
+import chardet
 from aiohttp.client import ClientTimeout
 from urllib.parse import urlparse
 from types import MappingProxyType
@@ -130,6 +130,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        Close the browser and clean up resources.
        """
        await self.browser_manager.close()
+        # Explicitly reset the static Playwright instance
+        BrowserManager._playwright_instance = None

    async def kill_session(self, session_id: str):
        """
@@ -679,14 +681,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                if console_log_type == "error":
                    self.logger.error(
                        message=f"Console error: {msg}",  # Use f-string for variable interpolation
-                        tag="CONSOLE",
-                        params={"msg": msg.text},
+                        tag="CONSOLE"
                    )
                elif console_log_type == "debug":
                    self.logger.debug(
                        message=f"Console: {msg}",  # Use f-string for variable interpolation
-                        tag="CONSOLE",
-                        params={"msg": msg.text},
+                        tag="CONSOLE"
                    )

            page.on("console", log_consol)
@@ -967,7 +967,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                    
                    for selector in selectors:
                        try:
-                            content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''")
+                            content = await page.evaluate(
+                                f"""Array.from(document.querySelectorAll("{selector}"))
+                                    .map(el => el.outerHTML)
+                                    .join('')"""
+                            )
                            html_parts.append(content)
                        except Error as e:
                            print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
@@ -1975,7 +1979,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
                await self.start()
            yield self._session
        finally:
-            await self.close()
+            pass

    def set_hook(self, hook_type: str, hook_func: Callable) -> None:
        if hook_type in self.hooks:
@@ -2091,7 +2095,7 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
                    
                    encoding = response.charset
                    if not encoding:
-                        encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8'                    
+                        encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'                    
                    
                    result = AsyncCrawlResponse(
                        html=content.tobytes().decode(encoding, errors='replace'),
--- a/crawl4ai/async_logger.py
+++ b/crawl4ai/async_logger.py
@@ -4,6 +4,7 @@ from typing import Optional, Dict, Any
 from colorama import Fore, Style, init
 import os
 from datetime import datetime
+from urllib.parse import unquote


 class LogLevel(Enum):
@@ -44,11 +45,11 @@ class AsyncLoggerBase(ABC):
        pass

    @abstractmethod
-    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
        pass

    @abstractmethod
-    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
        pass

 class AsyncLogger(AsyncLoggerBase):
@@ -130,6 +131,14 @@ class AsyncLogger(AsyncLoggerBase):
    def _get_icon(self, tag: str) -> str:
        """Get the icon for a tag, defaulting to info icon if not found."""
        return self.icons.get(tag, self.icons["INFO"])
+    
+    def _shorten(self, text, length, placeholder="..."):
+        """Middle-truncate text to exactly `length` characters (space-padded); the placeholder is clamped for tiny lengths."""
+        if len(text) <= length:
+            return text.ljust(length)  # Pad with spaces to reach desired length
+        half = max((length - len(placeholder)) // 2, 0)  # 0 when the placeholder alone uses up the budget
+        shortened = text[:half] + placeholder + text[len(text) - half:]  # explicit index: text[-0:] would be the whole text
+        return shortened[:max(length, 0)].ljust(length)  # Clamp, then pad to a consistent length

    def _write_to_file(self, message: str):
        """Write a message to the log file if configured."""
@@ -259,7 +268,7 @@ class AsyncLogger(AsyncLoggerBase):
        success: bool,
        timing: float,
        tag: str = "FETCH",
-        url_length: int = 50,
+        url_length: int = 100,
    ):
        """
        Convenience method for logging URL fetch status.
@@ -271,14 +280,15 @@ class AsyncLogger(AsyncLoggerBase):
            tag: Tag for the message
            url_length: Maximum length for URL in log
        """
+        decoded_url = unquote(url)
+        readable_url = self._shorten(decoded_url, url_length)
        self._log(
            level=LogLevel.SUCCESS if success else LogLevel.ERROR,
-            message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
+            message="{url} | {status} | ⏱: {timing:.2f}s",
            tag=tag,
            params={
-                "url": url,
-                "url_length": url_length,
-                "status": success,
+                "url": readable_url,
+                "status": "✓" if success else "✗",
                "timing": timing,
            },
            colors={
@@ -299,11 +309,13 @@ class AsyncLogger(AsyncLoggerBase):
            tag: Tag for the message
            url_length: Maximum length for URL in log
        """
+        decoded_url = unquote(url)
+        readable_url = self._shorten(decoded_url, url_length)
        self._log(
            level=LogLevel.ERROR,
-            message="{url:.{url_length}}... | Error: {error}",
+            message="{url} | Error: {error}",
            tag=tag,
-            params={"url": url, "url_length": url_length, "error": error},
+            params={"url": readable_url, "error": error},
        )

 class AsyncFileLogger(AsyncLoggerBase):
@@ -347,13 +359,13 @@ class AsyncFileLogger(AsyncLoggerBase):
        """Log an error message to file."""
        self._write_to_file("ERROR", message, tag)

-    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 100):
        """Log URL fetch status to file."""
        status = "SUCCESS" if success else "FAILED"
        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
        self._write_to_file("URL_STATUS", message, tag)

-    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 100):
        """Log error status to file."""
        message = f"{url[:url_length]}... | Error: {error}"
        self._write_to_file("ERROR", message, tag)
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -358,10 +358,11 @@ class AsyncWebCrawler:
                        html=html,
                        extracted_content=extracted_content,
                        config=config,  # Pass the config object instead of individual parameters
-                        screenshot=screenshot_data,
+                        screenshot_data=screenshot_data,
                        pdf_data=pdf_data,
                        verbose=config.verbose,
                        is_raw_html=True if url.startswith("raw:") else False,
+                        redirected_url=async_response.redirected_url, 
                        **kwargs,
                    )

@@ -380,18 +381,11 @@ class AsyncWebCrawler:
                    crawl_result.session_id = getattr(
                        config, "session_id", None)

-                    self.logger.success(
-                        message="{url:.50}... | Status: {status} | Total: {timing}",
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=crawl_result.success,
+                        timing=time.perf_counter() - start_time,
                        tag="COMPLETE",
-                        params={
-                            "url": cache_context.display_url,
-                            "status": crawl_result.success,
-                            "timing": f"{time.perf_counter() - start_time:.2f}s",
-                        },
-                        colors={
-                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
-                            "timing": Fore.YELLOW,
-                        },
                    )

                    # Update cache if appropriate
@@ -401,17 +395,12 @@ class AsyncWebCrawler:
                    return CrawlResultContainer(crawl_result)

                else:
-                    self.logger.success(
-                        message="{url:.50}... | Status: {status} | Total: {timing}",
-                        tag="COMPLETE",
-                        params={
-                            "url": cache_context.display_url,
-                            "status": True,
-                            "timing": f"{time.perf_counter() - start_time:.2f}s",
-                        },
-                        colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
+                    self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=True,
+                        timing=time.perf_counter() - start_time,
+                        tag="COMPLETE"
                    )
-
                    cached_result.success = bool(html)
                    cached_result.session_id = getattr(
                        config, "session_id", None)
@@ -446,7 +435,7 @@ class AsyncWebCrawler:
        html: str,
        extracted_content: str,
        config: CrawlerRunConfig,
-        screenshot: str,
+        screenshot_data: str,
        pdf_data: str,
        verbose: bool,
        **kwargs,
@@ -459,7 +448,7 @@ class AsyncWebCrawler:
            html: Raw HTML content
            extracted_content: Previously extracted content (if any)
            config: Configuration object controlling processing behavior
-            screenshot: Screenshot data (if any)
+            screenshot_data: Screenshot data (if any)
            pdf_data: PDF data (if any)
            verbose: Whether to enable verbose logging
            **kwargs: Additional parameters for backwards compatibility
@@ -564,20 +553,23 @@ class AsyncWebCrawler:
        markdown_result: MarkdownGenerationResult = (
            markdown_generator.generate_markdown(
                input_html=markdown_input_html,
-                base_url=url,
+                base_url=params.get("redirected_url", url)
                # html2text_options=kwargs.get('html2text', {})
            )
        )

        # Log processing completion
-        self.logger.info(
-            message="{url:.50}... | Time: {timing}s",
-            tag="SCRAPE",
-            params={
-                "url": _url,
-                "timing": int((time.perf_counter() - t1) * 1000) / 1000,
-            },
+        self.logger.url_status(
+            url=_url,
+            success=True,
+            timing=int((time.perf_counter() - t1) * 1000) / 1000,
+            tag="SCRAPE"
        )
+        # self.logger.info(
+        #     message="{url:.50}... | Time: {timing}s",
+        #     tag="SCRAPE",
+        #     params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
+        # )

        ################################
        # Structured Content Extraction           #
@@ -624,10 +616,6 @@ class AsyncWebCrawler:
                params={"url": _url, "timing": time.perf_counter() - t1},
            )

-        # Handle screenshot and PDF data
-        screenshot_data = None if not screenshot else screenshot
-        pdf_data = None if not pdf_data else pdf_data
-
        # Apply HTML formatting if requested
        if config.prettiify:
            cleaned_html = fast_format_html(cleaned_html)
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -28,6 +28,7 @@ from lxml import etree
 from lxml import html as lhtml
 from typing import List
 from .models import ScrapingResult, MediaItem, Link, Media, Links
+import copy

 # Pre-compile regular expressions for Open Graph and Twitter metadata
 OG_REGEX = re.compile(r"^og:")
@@ -48,7 +49,7 @@ def parse_srcset(s: str) -> List[Dict]:
        if len(parts) >= 1:
            url = parts[0]
            width = (
-                parts[1].rstrip("w")
+                parts[1].rstrip("w").split('.')[0]
                if len(parts) > 1 and parts[1].endswith("w")
                else None
            )
@@ -128,7 +129,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        Returns:
            ScrapingResult: A structured result containing the scraped content.
        """
-        raw_result = self._scrap(url, html, is_async=False, **kwargs)
+        actual_url = kwargs.get("redirected_url", url)
+        raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
        if raw_result is None:
            return ScrapingResult(
                cleaned_html="",
@@ -619,6 +621,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                return False

            keep_element = False
+            # Special case for table elements - always preserve structure
+            if element.name in ["tr", "td", "th"]:
+                keep_element = True

            exclude_domains = kwargs.get("exclude_domains", [])
            # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
@@ -859,6 +864,8 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        parser_type = kwargs.get("parser", "lxml")
        soup = BeautifulSoup(html, parser_type)
        body = soup.body
+        if body is None:
+            raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
        base_domain = get_base_domain(url)
        
        # Early removal of all images if exclude_all_images is set
@@ -897,23 +904,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                for element in body.select(excluded_selector):
                    element.extract()

-        # if False and css_selector:
-        #     selected_elements = body.select(css_selector)
-        #     if not selected_elements:
-        #         return {
-        #             "markdown": "",
-        #             "cleaned_html": "",
-        #             "success": True,
-        #             "media": {"images": [], "videos": [], "audios": []},
-        #             "links": {"internal": [], "external": []},
-        #             "metadata": {},
-        #             "message": f"No elements found for CSS selector: {css_selector}",
-        #         }
-        #         # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
-        #     body = soup.new_tag("div")
-        #     for el in selected_elements:
-        #         body.append(el)
-
        content_element = None
        if target_elements:
            try:
@@ -922,12 +912,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                    for_content_targeted_element.extend(body.select(target_element))
                content_element = soup.new_tag("div")
                for el in for_content_targeted_element:
-                    content_element.append(el)
+                    content_element.append(copy.deepcopy(el))
            except Exception as e:
                self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
                return None
        else:
-            content_element = body        
+            content_element = body     

        kwargs["exclude_social_media_domains"] = set(
            kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
@@ -1308,6 +1298,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            "source",
            "track",
            "wbr",
+            "tr",
+            "td",
+            "th",
        }

        for el in reversed(list(root.iterdescendants())):
@@ -1540,26 +1533,6 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                self._log("error", f"Error extracting metadata: {str(e)}", "SCRAPE")
                meta = {}

-            # Handle CSS selector targeting
-            # if css_selector:
-            #     try:
-            #         selected_elements = body.cssselect(css_selector)
-            #         if not selected_elements:
-            #             return {
-            #                 "markdown": "",
-            #                 "cleaned_html": "",
-            #                 "success": True,
-            #                 "media": {"images": [], "videos": [], "audios": []},
-            #                 "links": {"internal": [], "external": []},
-            #                 "metadata": meta,
-            #                 "message": f"No elements found for CSS selector: {css_selector}",
-            #             }
-            #         body = lhtml.Element("div")
-            #         body.extend(selected_elements)
-            #     except Exception as e:
-            #         self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
-            #         return None
-
            content_element = None
            if target_elements:
                try:
@@ -1567,7 +1540,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                    for target_element in target_elements:
                        for_content_targeted_element.extend(body.cssselect(target_element))
                    content_element = lhtml.Element("div")
-                    content_element.extend(for_content_targeted_element)
+                    content_element.extend(copy.deepcopy(for_content_targeted_element))
                except Exception as e:
                    self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
                    return None
@@ -1636,7 +1609,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            # Remove empty elements
            self.remove_empty_elements_fast(body, 1)

-            # Remvoe unneeded attributes
+            # Remove unneeded attributes
            self.remove_unwanted_attributes_fast(
                body, keep_data_attributes=kwargs.get("keep_data_attributes", False)
            )
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -11,6 +11,7 @@ from .scorers import URLScorer
 from . import DeepCrawlStrategy

 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
+from ..utils import normalize_url_for_deep_crawl

 from math import inf as infinity

@@ -106,13 +107,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
        valid_links = []
        for link in links:
            url = link.get("href")
-            if url in visited:
+            base_url = normalize_url_for_deep_crawl(url, source_url)
+            if base_url in visited:
                continue
            if not await self.can_process_url(url, new_depth):
                self.stats.urls_skipped += 1
                continue
                
-            valid_links.append(url)
+            valid_links.append(base_url)
            
        # If we have more valid links than capacity, limit them
        if len(valid_links) > remaining_capacity:
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -117,7 +117,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
                self.stats.urls_skipped += 1
                continue
-            
+
+            visited.add(base_url)
            valid_links.append((base_url, score))
        
        # If we have more valid links than capacity, sort by score and take the top ones
@@ -158,7 +159,6 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
        while current_level and not self._cancel_event.is_set():
            next_level: List[Tuple[str, Optional[str]]] = []
            urls = [url for url, _ in current_level]
-            visited.update(urls)

            # Clone the config to disable deep crawling recursion and enforce batch mode.
            batch_config = config.clone(deep_crawl_strategy=None, stream=False)
--- a/crawl4ai/js_snippet/remove_overlay_elements.js
+++ b/crawl4ai/js_snippet/remove_overlay_elements.js
@@ -115,5 +115,6 @@ async () => {
    document.body.style.overflow = "auto";

    // Wait a bit for any animations to complete
-    await new Promise((resolve) => setTimeout(resolve, 100));
+    document.body.scrollIntoView(false);
+    await new Promise((resolve) => setTimeout(resolve, 50));
 };
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2003,6 +2003,10 @@ def normalize_url(href, base_url):
    if not parsed_base.scheme or not parsed_base.netloc:
        raise ValueError(f"Invalid base URL format: {base_url}")

+    # Append a trailing slash whenever base_url lacks one, so urljoin treats it as a directory (NOTE: this also applies to file-like paths such as /page.html)
+    if not base_url.endswith('/'):
+        base_url = base_url + '/'
+
    # Use urljoin to handle all cases
    normalized = urljoin(base_url, href.strip())
    return normalized
@@ -2047,7 +2051,7 @@ def normalize_url_for_deep_crawl(href, base_url):
    normalized = urlunparse((
        parsed.scheme,
        netloc,
-        parsed.path.rstrip('/') or '/',  # Normalize trailing slash
+        parsed.path.rstrip('/'),  # Normalize trailing slash
        parsed.params,
        query,
        fragment
@@ -2075,7 +2079,7 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
    normalized = urlunparse((
        parsed.scheme,
        parsed.netloc.lower(),
-        parsed.path,
+        parsed.path.rstrip('/'),
        parsed.params,
        parsed.query,
        ''  # Remove fragment
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -60,6 +60,8 @@ async def handle_llm_qa(
 ) -> str:
    """Process QA using LLM with crawled content as context."""
    try:
+        if not url.startswith(('http://', 'https://')):
+            url = 'https://' + url
        # Extract base URL by finding last '?q=' occurrence
        last_q_index = url.rfind('?q=')
        if last_q_index != -1:
@@ -73,7 +75,7 @@ async def handle_llm_qa(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=result.error_message
                )
-            content = result.markdown.fit_markdown
+            content = result.markdown.fit_markdown or result.markdown.raw_markdown

        # Create prompt and get LLM response
        prompt = f"""Use the following content as context to answer the question.
@@ -397,6 +399,7 @@ async def handle_crawl_request(
    peak_mem_mb = start_mem_mb
    
    try:
+        urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls]
        browser_config = BrowserConfig.load(browser_config)
        crawler_config = CrawlerRunConfig.load(crawler_config)

--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -432,7 +432,7 @@ async def execute_js(
 async def llm_endpoint(
    request: Request,
    url: str = Path(...),
-    q: Optional[str] = Query(None),
+    q: str = Query(...),
    _td: Dict = Depends(token_dep),
 ):
    if not q:
--- a/docs/examples/full_page_screenshot_and_pdf_export.md
+++ b/docs/examples/full_page_screenshot_and_pdf_export.md
@@ -12,9 +12,10 @@ We’ve introduced a new feature that effortlessly handles even the biggest page

 **Simple Example:**
 ```python
-import os, sys
+import os
+import sys
 import asyncio
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

 # Adjust paths as needed
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -26,9 +27,11 @@ async def main():
        # Request both PDF and screenshot
        result = await crawler.arun(
            url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
-            cache_mode=CacheMode.BYPASS,
-            pdf=True,
-            screenshot=True
+            config=CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                pdf=True,
+                screenshot=True
+            )
        )
        
        if result.success:
@@ -40,9 +43,8 @@ async def main():
            
            # Save PDF
            if result.pdf:
-                pdf_bytes = b64decode(result.pdf)
                with open(os.path.join(__location__, "page.pdf"), "wb") as f:
-                    f.write(pdf_bytes)
+                    f.write(result.pdf)

 if __name__ == "__main__":
    asyncio.run(main())
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -232,6 +232,7 @@ async def main():

 if __name__ == "__main__":
    asyncio.run(main())
+```

 ## 2.4 Compliance & Ethics

--- a/docs/md_v2/core/browser-crawler-config.md
+++ b/docs/md_v2/core/browser-crawler-config.md
@@ -36,8 +36,6 @@ class BrowserConfig:

 ### Key Fields to Note

-
-
 1. **`browser_type`**  
 - Options: `"chromium"`, `"firefox"`, or `"webkit"`.  
 - Defaults to `"chromium"`.  
@@ -215,6 +213,7 @@ class CrawlerRunConfig:
    - The display mode for progress information (`DETAILED`, `BRIEF`, etc.).  
    - Affects how much information is printed during the crawl.

+
 ### Helper Methods

 The `clone()` method is particularly useful for creating variations of your crawler configuration:
@@ -248,9 +247,6 @@ The `clone()` method:
 ---


-
-
-
 ## 3. LLMConfig Essentials

 ### Key fields to note
--- a/docs/md_v2/extraction/llm-strategies.md
+++ b/docs/md_v2/extraction/llm-strategies.md
@@ -2,7 +2,7 @@

 In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that:

-1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more).  
+1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more).  
 2. Automatically splits content into chunks (if desired) to handle token limits, then combines results.  
 3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach.

@@ -18,13 +18,19 @@ In some cases, you need to extract **complex or unstructured** information from

 ---

-## 2. Provider-Agnostic via LightLLM
+## 2. Provider-Agnostic via LiteLLM

-Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide:
+You can use `LlmConfig` to quickly configure multiple variations of LLMs and experiment with them to find the optimal one for your use case. You can read more about `LlmConfig` [here](/api/parameters).
+
+```python
+llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+```
+
+Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:

 - **`provider`**: The `<provider>/<model_name>` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.).  
 - **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it.  
- **`api_base`** (optional): If your provider has a custom endpoint.  
+- **`base_url`** (optional): If your provider has a custom endpoint.  

 This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily.

@@ -52,20 +58,19 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic

 Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.

-1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.  
-2. **`api_token`** (str): The API key or token for that model. May not be needed for local models.  
-3. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.  
-4. **`extraction_type`** (str): `"schema"` or `"block"`.  
-5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”  
-6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.  
-7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.  
-8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.  
-9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:  
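+1. **`llmConfig`** (LlmConfig): The LLM configuration holding the provider string and API token, e.g., `LlmConfig(provider="openai/gpt-4", api_token=os.getenv("OPENAI_API_KEY"))`.  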
+2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.  
+3. **`extraction_type`** (str): `"schema"` or `"block"`.  
+4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”  
+5. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.  
+6. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.  
+7. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.  
+8. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:  
   - `"markdown"`: The raw markdown (default).  
   - `"fit_markdown"`: The filtered “fit” markdown if you used a content filter.  
   - `"html"`: The cleaned or raw HTML.  
-10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.  
-11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).  
+9. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.  
+10. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).  

 **Example**:

@@ -233,8 +238,7 @@ class KnowledgeGraph(BaseModel):
 async def main():
    # LLM extraction strategy
    llm_strat = LLMExtractionStrategy(
-        provider="openai/gpt-4",
-        api_token=os.getenv('OPENAI_API_KEY'),
+        llmConfig = LlmConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
        schema=KnowledgeGraph.schema_json(),
        extraction_type="schema",
        instruction="Extract entities and relationships from the content. Return valid JSON.",
@@ -286,7 +290,7 @@ if __name__ == "__main__":

 ## 11. Conclusion

-**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LightLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
+**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:

 - Put your LLM strategy **in `CrawlerRunConfig`**.  
 - Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees.  
@@ -317,4 +321,4 @@ If your site’s data is consistent or repetitive, consider [`JsonCssExtractionS

 ---

-That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
+That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,8 +40,9 @@ dependencies = [
    "fake-useragent>=2.0.3",
    "click>=8.1.7",
    "pyperclip>=1.8.2",
-    "faust-cchardet>=2.1.19",
+    "chardet>=5.2.0",
    "aiohttp>=3.11.11",
+    "brotli>=1.1.0",
    "humanize>=4.10.0",
 ]
 classifiers = [
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,4 +21,5 @@ psutil>=6.1.1
 nltk>=3.9.1
 rich>=13.9.4
 cssselect>=1.2.0
-faust-cchardet>=2.1.19
+chardet>=5.2.0
+brotli>=1.1.0