feat: update version to 0.3.742

feat: add support for arm64 platform in Docker commands and update INSTALL_TYPE variable in docker-compose
chore: remove Railway deployment configuration and related documentation
2024-11-24 19:36:30 +08:00 · 2024-11-24 19:35:53 +08:00 · 2024-11-24 18:48:39 +08:00 · 2024-11-23 19:45:41 +08:00 · 2024-11-23 18:00:32 +08:00 · 2024-11-22 20:14:58 +08:00
22 changed files with 285 additions and 3021 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -211,5 +211,6 @@ git_issues.md
 .docs/
 .issues/
 .gitboss/
-
-manage-collab.sh 
+todo_executor.md
+protect-all-except-feature.sh
+manage-collab.sh
--- a/README.md
+++ b/README.md
@@ -142,6 +142,9 @@ docker pull unclecode/crawl4ai:gpu      # GPU-enabled version
 # Run the container
 docker run -p 11235:11235 unclecode/crawl4ai:basic  # Replace 'basic' with your chosen version

+# In case you want to set platform to arm64
+docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic
+
 # In case to allocate more shared memory for the container
 docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic
 ```
@@ -158,6 +161,12 @@ docker build -t crawl4ai:local \
  --build-arg INSTALL_TYPE=basic \  # Options: basic, all
  .

+# In case you want to set platform to arm64 (INSTALL_TYPE options: basic, all)
+docker build -t crawl4ai:local \
+  --build-arg INSTALL_TYPE=basic \
+  --platform linux/arm64 \
+  .
+
 # Run your local build
 docker run -p 11235:11235 crawl4ai:local
 ```
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.3.74"
+__version__ = "0.3.742"
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -477,7 +477,7 @@ class AsyncWebCrawler:
                word_count_threshold=word_count_threshold,
                css_selector=css_selector,
                only_text=kwargs.pop("only_text", False),
-                image_description_min_word_threshold=kwargs.get(
+                image_description_min_word_threshold=kwargs.pop(
                    "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
                ),
                content_filter = content_filter,
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -10,6 +10,13 @@ from abc import ABC, abstractmethod

 from snowballstemmer import stemmer

+
+# import regex
+# def tokenize_text(text):
+#     # Regular expression to match words or CJK (Chinese, Japanese, Korean) characters
+#     pattern = r'\p{L}+|\p{N}+|[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}ー]|[\p{P}]'
+#     return regex.findall(pattern, text)
+
 # from nltk.stem import PorterStemmer
 # ps = PorterStemmer()
 class RelevantContentFilter(ABC):
@@ -57,9 +64,14 @@ class RelevantContentFilter(ABC):
        query_parts = []
        
        # Title
-        if soup.title:
-            query_parts.append(soup.title.string)
-        elif soup.find('h1'):
+        try:
+            title = soup.title.string
+            if title:
+                query_parts.append(title)
+        except Exception:
+            pass
+
+        if soup.find('h1'):
            query_parts.append(soup.find('h1').get_text())
            
        # Meta tags
@@ -81,7 +93,7 @@ class RelevantContentFilter(ABC):
        return ' '.join(filter(None, query_parts))


-    def extract_text_chunks(self, body: Tag) -> List[Tuple[str, str]]:
+    def extract_text_chunks(self, body: Tag, min_word_threshold: int = None) -> List[Tuple[str, str]]:
        """
        Extracts text chunks from a BeautifulSoup body element while preserving order.
        Returns list of tuples (text, tag_name) for classification.
@@ -155,6 +167,9 @@ class RelevantContentFilter(ABC):
            if text:
                chunks.append((chunk_index, text, 'content', body))
        
+        if min_word_threshold:
+            chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold]
+        
        return chunks    
    

@@ -274,15 +289,26 @@ class BM25ContentFilter(RelevantContentFilter):
        }
        self.stemmer = stemmer(language)

-    def filter_content(self, html: str) -> List[str]:
+    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """Implements content filtering using BM25 algorithm with priority tag handling"""
        if not html or not isinstance(html, str):
            return []

        soup = BeautifulSoup(html, 'lxml')
+        
+        # Check if body is present
+        if not soup.body:
+            # Wrap in body tag if missing
+            soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')        
        body = soup.find('body')
-        query = self.extract_page_query(soup.find('head'), body)
-        candidates = self.extract_text_chunks(body)
+        
+        query = self.extract_page_query(soup, body)
+        
+        if not query:
+            return []
+            # return [self.clean_element(soup)]
+            
+        candidates = self.extract_text_chunks(body, min_word_threshold)

        if not candidates:
            return []
@@ -299,6 +325,10 @@ class BM25ContentFilter(RelevantContentFilter):
                   for _, chunk, _, _ in candidates]
        tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()]

+        # tokenized_corpus = [[self.stemmer.stemWord(word) for word in tokenize_text(chunk.lower())] 
+        #            for _, chunk, _, _ in candidates]
+        # tokenized_query = [self.stemmer.stemWord(word) for word in tokenize_text(query.lower())]
+
        # Clean from stop words and noise
        tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus]
        tokenized_query = clean_tokens(tokenized_query)
@@ -326,3 +356,147 @@ class BM25ContentFilter(RelevantContentFilter):
        selected_candidates.sort(key=lambda x: x[0])

        return [self.clean_element(tag) for _, _, tag in selected_candidates]
+
+
+class HeuristicContentFilter(RelevantContentFilter):
+    def __init__(self):
+        super().__init__()
+        # Weights for different heuristics
+        self.tag_weights = {
+            'article': 10,
+            'main': 8,
+            'section': 5,
+            'div': 3,
+            'p': 2,
+            'pre': 2,
+            'code': 2,
+            'blockquote': 2,
+            'li': 1,
+            'span': 1,
+        }
+        self.max_depth = 5  # Depth scale for score weighting: tags deeper below body are down-weighted, not excluded
+
+    def filter_content(self, html: str) -> List[str]:
+        """Implements heuristic content filtering without relying on a query."""
+        if not html or not isinstance(html, str):
+            return []
+
+        soup = BeautifulSoup(html, 'lxml')
+
+        # Ensure there is a body tag
+        if not soup.body:
+            soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
+        body = soup.body
+
+        # Extract candidate text chunks
+        candidates = self.extract_text_chunks(body)
+
+        if not candidates:
+            return []
+
+        # Score each candidate
+        scored_candidates = []
+        for index, text, tag_type, tag in candidates:
+            score = self.score_element(tag, text)
+            if score > 0:
+                scored_candidates.append((score, index, text, tag))
+
+        # Sort candidates by score and then by document order
+        scored_candidates.sort(key=lambda x: (-x[0], x[1]))
+
+        # Extract the top candidates (e.g., top 5)
+        top_candidates = scored_candidates[:5]  # Adjust the number as needed
+
+        # Sort the top candidates back to their original document order
+        top_candidates.sort(key=lambda x: x[1])
+
+        # Clean and return the content
+        return [self.clean_element(tag) for _, _, _, tag in top_candidates]
+
+    def score_element(self, tag: Tag, text: str) -> float:
+        """Compute a score for an element based on heuristics."""
+        if not text or not tag:
+            return 0
+
+        # Exclude unwanted tags
+        if self.is_excluded(tag):
+            return 0
+
+        # Text density
+        text_length = len(text.strip())
+        html_length = len(str(tag))
+        text_density = text_length / html_length if html_length > 0 else 0
+
+        # Link density
+        link_text_length = sum(len(a.get_text().strip()) for a in tag.find_all('a'))
+        link_density = link_text_length / text_length if text_length > 0 else 0
+
+        # Tag weight
+        tag_weight = self.tag_weights.get(tag.name, 1)
+
+        # Depth factor (prefer elements closer to the body tag)
+        depth = self.get_depth(tag)
+        depth_weight = max(self.max_depth - depth, 1) / self.max_depth
+
+        # Compute the final score
+        score = (text_density * tag_weight * depth_weight) / (1 + link_density)
+
+        return score
+
+    def get_depth(self, tag: Tag) -> int:
+        """Compute the depth of the tag from the body tag."""
+        depth = 0
+        current = tag
+        while current and current != current.parent and current.name != 'body':
+            current = current.parent
+            depth += 1
+        return depth
+
+    def extract_text_chunks(self, body: Tag) -> List[Tuple[int, str, str, Tag]]:
+        """
+        Extracts text chunks from the body element while preserving order.
+        Returns list of tuples (index, text, tag_type, tag) for scoring.
+        """
+        chunks = []
+        index = 0
+
+        def traverse(element):
+            nonlocal index
+            if isinstance(element, NavigableString):
+                return
+            if not isinstance(element, Tag):
+                return
+            if self.is_excluded(element):
+                return
+            # Only consider included tags
+            if element.name in self.included_tags:
+                text = element.get_text(separator=' ', strip=True)
+                if len(text.split()) >= self.min_word_count:
+                    tag_type = 'header' if element.name in self.header_tags else 'content'
+                    chunks.append((index, text, tag_type, element))
+                    index += 1
+                    # Do not traverse children of this element to prevent duplication
+                    return
+            for child in element.children:
+                traverse(child)
+
+        traverse(body)
+        return chunks
+
+    def is_excluded(self, tag: Tag) -> bool:
+        """Determine if a tag should be excluded based on heuristics."""
+        if tag.name in self.excluded_tags:
+            return True
+        class_id = ' '.join(filter(None, [
+            ' '.join(tag.get('class', [])),
+            tag.get('id', '')
+        ]))
+        if self.negative_patterns.search(class_id):
+            return True
+        # Exclude tags with high link density (e.g., navigation menus)
+        text = tag.get_text(separator=' ', strip=True)
+        link_text_length = sum(len(a.get_text(strip=True)) for a in tag.find_all('a'))
+        text_length = len(text)
+        if text_length > 0 and (link_text_length / text_length) > 0.5:
+            return True
+        return False
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -9,7 +9,7 @@ from bs4 import element, NavigableString, Comment
 from urllib.parse import urljoin
 from requests.exceptions import InvalidSchema
 # from .content_cleaning_strategy import ContentCleaningStrategy
-from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
+from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
 from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerationStrategy
 from .models import MarkdownGenerationResult
 from .utils import (
@@ -129,6 +129,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                    params={"error": str(e)}
                )
                markdown_generator = None
+                return {
+                    'markdown': f"Error using new markdown generation strategy: {str(e)}",
+                    'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
+                    'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.",
+                    'markdown_v2': None                    
+                }

        # Legacy method
        h = CustomHTML2Text()
@@ -228,24 +234,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                return None

        def process_image_old(img, url, index, total_images):
-            def parse_srcset(srcset_str):
-                """Parse srcset attribute into list of image URLs with their sizes."""
-                if not srcset_str:
-                    return []
-                
-                sources = []
-                # Split on http/https and filter empty strings
-                urls = [f"http{part}" for part in srcset_str.split("http") if part]
-                
-                for url in urls:
-                    # Remove trailing comma and whitespace, then split to get width
-                    url = url.strip().rstrip(',')
-                    parts = url.rsplit(' ', 1)
-                    img_url = parts[0].strip()
-                    width = parts[1].rstrip('w') if len(parts) > 1 else None
-                    sources.append({'url': img_url, 'width': width})
-                
-                return sources          
+                   
            
            #Check if an image has valid display and inside undesired html elements
            def is_valid_image(img, parent, parent_classes):
@@ -376,12 +365,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            unique_urls = set()
            image_variants = []
            
+            # Generate a unique group ID for this set of variants
+            group_id = index 
+            
            # Base image info template
            base_info = {
                'alt': alt,
                'desc': find_closest_parent_with_useful_text(img),
                'score': score,
-                'type': 'image'
+                'type': 'image',
+                'group_id': group_id # Group ID for this set of variants
            }

            # Inline function for adding variants
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -283,7 +283,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                print(f"[LOG] ✅ Crawled {url} successfully!")
            
            return html
-        except InvalidArgumentException:
+        except InvalidArgumentException as e:
            if not hasattr(e, 'msg'):
                e.msg = sanitize_input_encode(str(e))
            raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
--- a/deploy/railway/README.md
+++ b/deploy/railway/README.md
@@ -1,19 +0,0 @@
-# Railway Deployment
-
-## Quick Deploy
-[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/template/crawl4ai)
-
-## Manual Setup
-1. Fork this repository
-2. Create a new Railway project 
-3. Configure environment variables:
-   - `INSTALL_TYPE`: basic or all
-   - `ENABLE_GPU`: true/false
-4. Deploy!
-
-## Configuration
-See `railway.toml` for:
- Memory limits
- Health checks
- Restart policies
- Scaling options
--- a/deploy/railway/button.json
+++ b/deploy/railway/button.json
@@ -1,33 +0,0 @@
-{
-    "name": "Crawl4AI",
-    "description": "LLM Friendly Web Crawler & Scraper",
-    "render": {
-      "dockerfile": {
-        "path": "Dockerfile"
-      }
-    },
-    "env": [
-      {
-        "key": "INSTALL_TYPE",
-        "description": "Installation type (basic/all)",
-        "default": "basic",
-        "required": true
-      },
-      {
-        "key": "ENABLE_GPU",
-        "description": "Enable GPU support",
-        "default": "false",
-        "required": false
-      }
-    ],
-    "services": [
-      {
-        "name": "web",
-        "dockerfile": "./Dockerfile",
-        "healthcheck": {
-          "path": "/health",
-          "port": 11235
-        }
-      }
-    ]
-  }
--- a/deploy/railway/railway.toml
+++ b/deploy/railway/railway.toml
@@ -1,18 +0,0 @@
-# railway.toml
-[build]
-builder = "DOCKERFILE"
-dockerfilePath = "Dockerfile"
-
-[deploy]
-startCommand = "uvicorn main:app --host 0.0.0.0 --port $PORT"
-healthcheckPath = "/health"
-restartPolicyType = "ON_FAILURE"
-restartPolicyMaxRetries = 3
-
-[deploy.memory]
-soft = 2048 # 2GB min for Playwright
-hard = 4096 # 4GB max
-
-[deploy.scaling]
-min = 1
-max = 1
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -4,8 +4,8 @@ services:
      context: .
      dockerfile: Dockerfile
      args:
-        PYTHON_VERSION: 3.10
-        INSTALL_TYPE: all
+        PYTHON_VERSION: "3.10"
+        INSTALL_TYPE: ${INSTALL_TYPE:-basic}
        ENABLE_GPU: false
    profiles: ["local"]
    ports:
--- a/docs/examples/v0.3.74.overview.py
+++ b/docs/examples/v0.3.74.overview.py
@@ -52,34 +52,7 @@ async def download_example():
        else:
            print("\nNo files were downloaded")

-# 2. Content Filtering with BM25 Example
-async def content_filtering_example():
-    """Example of using the new BM25 content filtering"""
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        # Create filter with custom query for OpenAI's blog
-        content_filter = BM25ContentFilter(
-            # user_query="Investment and fundraising",
-            # user_query="Robotic",
-            bm25_threshold=1.0
-        )
-        
-        result = await crawler.arun(
-            url="https://techcrunch.com/",
-            content_filter=content_filter,
-            cache_mode=CacheMode.BYPASS
-        )
-        
-        print(f"Filtered content: {len(result.fit_markdown)}")
-        print(f"Filtered content: {result.fit_markdown}")
-        
-        # Save html 
-        with open(os.path.join(__data__, "techcrunch.html"), "w") as f:
-            f.write(result.fit_html)
-        
-        with open(os.path.join(__data__, "filtered_content.md"), "w") as f:
-            f.write(result.fit_markdown)
-
-# 3. Local File and Raw HTML Processing Example
+# 2. Local File and Raw HTML Processing Example
 async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Create a sample HTML file
@@ -115,6 +88,68 @@ async def local_and_raw_html_example():
        print("Local file content:", local_result.markdown)
        print("\nRaw HTML content:", raw_result.markdown)

+# 3. Enhanced Markdown Generation Example
+async def markdown_generation_example():
+    """Example of enhanced markdown generation with citations and LLM-friendly features"""
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # Create a content filter (optional)
+        content_filter = BM25ContentFilter(
+            # user_query="History and cultivation",
+            bm25_threshold=1.0
+        )
+        
+        result = await crawler.arun(
+            url="https://en.wikipedia.org/wiki/Apple",
+            css_selector="main div#bodyContent",
+            content_filter=content_filter,
+            cache_mode=CacheMode.BYPASS
+        )
+        
+        from crawl4ai import AsyncWebCrawler
+        from crawl4ai.content_filter_strategy import BM25ContentFilter
+        
+        result = await crawler.arun(
+            url="https://en.wikipedia.org/wiki/Apple",
+            css_selector="main div#bodyContent",
+            content_filter=BM25ContentFilter()
+        )
+        print(result.markdown_v2.fit_markdown)
+        
+        print("\nMarkdown Generation Results:")
+        print(f"1. Original markdown length: {len(result.markdown)}")
+        print(f"2. New markdown versions (markdown_v2):")
+        print(f"   - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
+        print(f"   - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}")
+        print(f"   - References section length: {len(result.markdown_v2.references_markdown)}")
+        if result.markdown_v2.fit_markdown:
+            print(f"   - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}")
+        
+        # Save examples to files
+        output_dir = os.path.join(__data__, "markdown_examples")
+        os.makedirs(output_dir, exist_ok=True)
+        
+        # Save different versions
+        with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
+            f.write(result.markdown_v2.raw_markdown)
+            
+        with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
+            f.write(result.markdown_v2.markdown_with_citations)
+            
+        with open(os.path.join(output_dir, "3_references.md"), "w") as f:
+            f.write(result.markdown_v2.references_markdown)
+            
+        if result.markdown_v2.fit_markdown:
+            with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
+                f.write(result.markdown_v2.fit_markdown)
+                
+        print(f"\nMarkdown examples saved to: {output_dir}")
+        
+        # Show a sample of citations and references
+        print("\nSample of markdown with citations:")
+        print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
+        print("Sample of references:")
+        print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...")
+
 # 4. Browser Management Example
 async def browser_management_example():
    """Example of using enhanced browser management features"""
@@ -208,9 +243,13 @@ async def api_example():
                    headers=headers
                ) as status_response:
                    result = await status_response.json()
-                    print(f"Task result: {result}")
+                    print(f"Task status: {result['status']}")
                    
                    if result["status"] == "completed":
+                        print("Task completed!")
+                        print("Results:")
+                        news = json.loads(result["results"][0]['extracted_content'])
+                        print(json.dumps(news[:4], indent=2))
                        break
                    else:
                        await asyncio.sleep(1)
@@ -220,15 +259,15 @@ async def main():
    # print("Running Crawl4AI feature examples...")
    
    # print("\n1. Running Download Example:")
-    await download_example()
+    # await download_example()
    
-    # print("\n2. Running Content Filtering Example:")
-    await content_filtering_example()
+    # print("\n2. Running Markdown Generation Example:")
+    # await markdown_generation_example()
    
-    # print("\n3. Running Local and Raw HTML Example:")
-    await local_and_raw_html_example()
+    # # print("\n3. Running Local and Raw HTML Example:")
+    # await local_and_raw_html_example()
    
-    # print("\n4. Running Browser Management Example:")
+    # # print("\n4. Running Browser Management Example:")
    await browser_management_example()
    
    # print("\n5. Running API Example:")
--- a/pages/app.css
+++ b/pages/app.css
@@ -1,131 +0,0 @@
-:root {
-    --ifm-font-size-base: 100%;
-    --ifm-line-height-base: 1.65;
-    --ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, sans-serif,
-        BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji",
-        "Segoe UI Symbol";
-}
-html {
-    -webkit-font-smoothing: antialiased;
-    -webkit-text-size-adjust: 100%;
-    text-size-adjust: 100%;
-    font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
-}
-body {
-    background-color: #1a202c;
-    color: #fff;
-}
-.tab-content {
-    max-height: 400px;
-    overflow: auto;
-}
-pre {
-    white-space: pre-wrap;
-    font-size: 14px;
-}
-pre code {
-    width: 100%;
-}
-
-/* Custom styling for docs-item class and Markdown generated elements */
-.docs-item {
-    background-color: #2d3748; /* bg-gray-800 */
-    padding: 1rem; /* p-4 */
-    border-radius: 0.375rem; /* rounded */
-    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */
-    margin-bottom: 1rem; /* space between items */
-    line-height: 1.5; /* leading-normal */
-}
-
-.docs-item h3,
-.docs-item h4 {
-    color: #ffffff; /* text-white */
-    font-size: 1.25rem; /* text-xl */
-    font-weight: 700; /* font-bold */
-    margin-bottom: 0.5rem; /* mb-2 */
-}
-.docs-item h4 {
-    font-size: 1rem; /* text-xl */
-}
-
-.docs-item p {
-    color: #e2e8f0; /* text-gray-300 */
-    margin-bottom: 0.5rem; /* mb-2 */
-}
-
-.docs-item code {
-    background-color: #1a202c; /* bg-gray-900 */
-    color: #e2e8f0; /* text-gray-300 */
-    padding: 0.25rem 0.5rem; /* px-2 py-1 */
-    border-radius: 0.25rem; /* rounded */
-    font-size: 0.875rem; /* text-sm */
-}
-
-.docs-item pre {
-    background-color: #1a202c; /* bg-gray-900 */
-    color: #e2e8f0; /* text-gray-300 */
-    padding: 0.5rem; /* p-2 */
-    border-radius: 0.375rem; /* rounded */
-    overflow: auto; /* overflow-auto */
-    margin-bottom: 0.5rem; /* mb-2 */
-}
-
-.docs-item div {
-    color: #e2e8f0; /* text-gray-300 */
-    font-size: 1rem; /* prose prose-sm */
-    line-height: 1.25rem; /* line-height for readability */
-}
-
-/* Adjustments to make prose class more suitable for dark mode */
-.prose {
-    max-width: none; /* max-w-none */
-}
-
-.prose p,
-.prose ul {
-    margin-bottom: 1rem; /* mb-4 */
-}
-
-.prose code {
-    /* background-color: #4a5568; */ /* bg-gray-700 */
-    color: #65a30d; /* text-white */
-    padding: 0.25rem 0.5rem; /* px-1 py-0.5 */
-    border-radius: 0.25rem; /* rounded */
-    display: inline-block; /* inline-block */
-}
-
-.prose pre {
-    background-color: #1a202c; /* bg-gray-900 */
-    color: #ffffff; /* text-white */
-    padding: 0.5rem; /* p-2 */
-    border-radius: 0.375rem; /* rounded */
-}
-
-.prose h3 {
-    color: #65a30d; /* text-white */
-    font-size: 1.25rem; /* text-xl */
-    font-weight: 700; /* font-bold */
-    margin-bottom: 0.5rem; /* mb-2 */
-}
-
-body {
-    background-color: #1a1a1a;
-    color: #b3ff00;
-}
-.sidebar {
-    color: #b3ff00;
-    border-right: 1px solid #333;
-}
-.sidebar a {
-    color: #b3ff00;
-    text-decoration: none;
-}
-.sidebar a:hover {
-    background-color: #555;
-}
-.content-section {
-    display: none;
-}
-.content-section.active {
-    display: block;
-}
--- a/pages/app.js
+++ b/pages/app.js
@@ -1,356 +0,0 @@
-// JavaScript to manage dynamic form changes and logic
-document.getElementById("extraction-strategy-select").addEventListener("change", function () {
-    const strategy = this.value;
-    const providerModelSelect = document.getElementById("provider-model-select");
-    const tokenInput = document.getElementById("token-input");
-    const instruction = document.getElementById("instruction");
-    const semantic_filter = document.getElementById("semantic_filter");
-    const instruction_div = document.getElementById("instruction_div");
-    const semantic_filter_div = document.getElementById("semantic_filter_div");
-    const llm_settings = document.getElementById("llm_settings");
-
-    if (strategy === "LLMExtractionStrategy") {
-        // providerModelSelect.disabled = false;
-        // tokenInput.disabled = false;
-        // semantic_filter.disabled = true;
-        // instruction.disabled = false;
-        llm_settings.classList.remove("hidden");
-        instruction_div.classList.remove("hidden");
-        semantic_filter_div.classList.add("hidden");
-    } else if (strategy === "NoExtractionStrategy") {
-        semantic_filter_div.classList.add("hidden");
-        instruction_div.classList.add("hidden");
-        llm_settings.classList.add("hidden");
-    } else {
-        // providerModelSelect.disabled = true;
-        // tokenInput.disabled = true;
-        // semantic_filter.disabled = false;
-        // instruction.disabled = true;
-        llm_settings.classList.add("hidden");
-        instruction_div.classList.add("hidden");
-        semantic_filter_div.classList.remove("hidden");
-    }
-
-
-});
-
-// Get the selected provider model and token from local storage
-const storedProviderModel = localStorage.getItem("provider_model");
-const storedToken = localStorage.getItem(storedProviderModel);
-
-if (storedProviderModel) {
-    document.getElementById("provider-model-select").value = storedProviderModel;
-}
-
-if (storedToken) {
-    document.getElementById("token-input").value = storedToken;
-}
-
-// Handle provider model dropdown change
-document.getElementById("provider-model-select").addEventListener("change", () => {
-    const selectedProviderModel = document.getElementById("provider-model-select").value;
-    const storedToken = localStorage.getItem(selectedProviderModel);
-
-    if (storedToken) {
-        document.getElementById("token-input").value = storedToken;
-    } else {
-        document.getElementById("token-input").value = "";
-    }
-});
-
-// Fetch total count from the database
-axios
-    .get("/total-count")
-    .then((response) => {
-        document.getElementById("total-count").textContent = response.data.count;
-    })
-    .catch((error) => console.error(error));
-
-// Handle crawl button click
-document.getElementById("crawl-btn").addEventListener("click", () => {
-    // validate input to have both URL and API token
-    // if selected extraction strategy is LLMExtractionStrategy, then API token is required
-    if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") {
-        if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
-            alert("Please enter both URL(s) and API token.");
-            return;
-        }
-    }
-
-    const selectedProviderModel = document.getElementById("provider-model-select").value;
-    const apiToken = document.getElementById("token-input").value;
-    const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
-    const bypassCache = document.getElementById("bypass-cache-checkbox").checked;
-
-    // Save the selected provider model and token to local storage
-    localStorage.setItem("provider_model", selectedProviderModel);
-    localStorage.setItem(selectedProviderModel, apiToken);
-
-    const urlsInput = document.getElementById("url-input").value;
-    const urls = urlsInput.split(",").map((url) => url.trim());
-    const data = {
-        urls: urls,
-        include_raw_html: true,
-        bypass_cache: bypassCache,
-        extract_blocks: extractBlocks,
-        word_count_threshold: parseInt(document.getElementById("threshold").value),
-        extraction_strategy: document.getElementById("extraction-strategy-select").value,
-        extraction_strategy_args: {
-            provider: selectedProviderModel,
-            api_token: apiToken,
-            instruction: document.getElementById("instruction").value,
-            semantic_filter: document.getElementById("semantic_filter").value,
-        },
-        chunking_strategy: document.getElementById("chunking-strategy-select").value,
-        chunking_strategy_args: {},
-        css_selector: document.getElementById("css-selector").value,
-        screenshot: document.getElementById("screenshot-checkbox").checked,
-        // instruction: document.getElementById("instruction").value,
-        // semantic_filter: document.getElementById("semantic_filter").value,
-        verbose: true,
-    };
-
-    // import requests
-
-    // data = {
-    //   "urls": [
-    //     "https://www.nbcnews.com/business"
-    //   ],
-    //   "word_count_threshold": 10,
-    //   "extraction_strategy": "NoExtractionStrategy",
-    // }
-    
-    // response = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally 
-    // print(response.json())
-
-    // save api token to local storage
-    localStorage.setItem("api_token", document.getElementById("token-input").value);
-
-    document.getElementById("loading").classList.remove("hidden");
-    document.getElementById("result").style.visibility = "hidden";
-    document.getElementById("code_help").style.visibility = "hidden";
-
-    axios
-        .post("/crawl", data)
-        .then((response) => {
-            const result = response.data.results[0];
-            const parsedJson = JSON.parse(result.extracted_content);
-            document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
-            document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
-            document.getElementById("markdown-result").textContent = result.markdown;
-            document.getElementById("media-result").textContent = JSON.stringify( result.media, null, 2);
-            if (result.screenshot){
-                const imgElement = document.createElement("img");
-                // Set the src attribute with the base64 data
-                imgElement.src = `data:image/png;base64,${result.screenshot}`;
-                document.getElementById("screenshot-result").innerHTML = "";
-                document.getElementById("screenshot-result").appendChild(imgElement);
-            }
-            
-            // Update code examples dynamically
-            const extractionStrategy = data.extraction_strategy;
-            const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
-
-            // REMOVE API TOKEN FROM CODE EXAMPLES
-            data.extraction_strategy_args.api_token = "your_api_token";
-
-            if (data.extraction_strategy === "NoExtractionStrategy") {
-                delete data.extraction_strategy_args;
-                delete data.extrac_blocks;
-            }
-
-            if (data.chunking_strategy === "RegexChunking") {
-                delete data.chunking_strategy_args;
-            }
-
-            delete data.verbose;
-
-            if (data.css_selector === "") {
-                delete data.css_selector;
-            }
-
-            if (!data.bypass_cache) {
-                delete data.bypass_cache;
-            }
-
-            if (!data.extract_blocks) {
-                delete data.extract_blocks;
-            }
-
-            if (!data.include_raw_html) {
-                delete data.include_raw_html;
-            }
-
-            document.getElementById(
-                "curl-code"
-            ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
-                ...data,
-                api_token: isLLMExtraction ? "your_api_token" : undefined,
-            }, null, 2)}' https://crawl4ai.com/crawl`;
-
-            document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
-                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
-                null,
-                2
-            )}\n\nresponse = requests.post("https://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
-
-            document.getElementById(
-                "nodejs-code"
-            ).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
-                { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
-                null,
-                2
-            )};\n\naxios.post("https://crawl4ai.com/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
-
-            document.getElementById(
-                "library-code"
-            ).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n    url='${
-                urls[0]
-            }',\n    word_count_threshold=${data.word_count_threshold},\n    extraction_strategy=${
-                isLLMExtraction
-                    ? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")`
-                    : extractionStrategy + "()"
-            },\n    chunking_strategy=${data.chunking_strategy}(),\n    bypass_cache=${
-                data.bypass_cache
-            },\n    css_selector="${data.css_selector}"\n)\nprint(result)`;
-
-            // Highlight code syntax
-            hljs.highlightAll();
-
-            // Select JSON tab by default
-            document.querySelector('.tab-btn[data-tab="json"]').click();
-
-            document.getElementById("loading").classList.add("hidden");
-
-            document.getElementById("result").style.visibility = "visible";
-            document.getElementById("code_help").style.visibility = "visible";
-
-            // increment the total count
-            document.getElementById("total-count").textContent =
-                parseInt(document.getElementById("total-count").textContent) + 1;
-        })
-        .catch((error) => {
-            console.error(error);
-            document.getElementById("loading").classList.add("hidden");
-        });
-});
-
-// Handle tab clicks
-document.querySelectorAll(".tab-btn").forEach((btn) => {
-    btn.addEventListener("click", () => {
-        const tab = btn.dataset.tab;
-        document.querySelectorAll(".tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
-        btn.classList.add("bg-lime-700", "text-white");
-        document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
-        document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
-    });
-});
-
-// Handle code tab clicks
-document.querySelectorAll(".code-tab-btn").forEach((btn) => {
-    btn.addEventListener("click", () => {
-        const tab = btn.dataset.tab;
-        document.querySelectorAll(".code-tab-btn").forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
-        btn.classList.add("bg-lime-700", "text-white");
-        document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
-        document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
-    });
-});
-
-// Handle copy to clipboard button clicks
-
-async function copyToClipboard(text) {
-    if (navigator.clipboard && navigator.clipboard.writeText) {
-        return navigator.clipboard.writeText(text);
-    } else {
-        return fallbackCopyTextToClipboard(text);
-    }
-}
-
-function fallbackCopyTextToClipboard(text) {
-    return new Promise((resolve, reject) => {
-        const textArea = document.createElement("textarea");
-        textArea.value = text;
-
-        // Avoid scrolling to bottom
-        textArea.style.top = "0";
-        textArea.style.left = "0";
-        textArea.style.position = "fixed";
-
-        document.body.appendChild(textArea);
-        textArea.focus();
-        textArea.select();
-
-        try {
-            const successful = document.execCommand("copy");
-            if (successful) {
-                resolve();
-            } else {
-                reject();
-            }
-        } catch (err) {
-            reject(err);
-        }
-
-        document.body.removeChild(textArea);
-    });
-}
-
-document.querySelectorAll(".copy-btn").forEach((btn) => {
-    btn.addEventListener("click", () => {
-        const target = btn.dataset.target;
-        const code = document.getElementById(target).textContent;
-        //navigator.clipboard.writeText(code).then(() => {
-        copyToClipboard(code).then(() => {
-            btn.textContent = "Copied!";
-            setTimeout(() => {
-                btn.textContent = "Copy";
-            }, 2000);
-        });
-    });
-});
-
-document.addEventListener("DOMContentLoaded", async () => {
-    try {
-        const extractionResponse = await fetch("/strategies/extraction");
-        const extractionStrategies = await extractionResponse.json();
-
-        const chunkingResponse = await fetch("/strategies/chunking");
-        const chunkingStrategies = await chunkingResponse.json();
-
-        renderStrategies("extraction-strategies", extractionStrategies);
-        renderStrategies("chunking-strategies", chunkingStrategies);
-    } catch (error) {
-        console.error("Error fetching strategies:", error);
-    }
-});
-
-function renderStrategies(containerId, strategies) {
-    const container = document.getElementById(containerId);
-    container.innerHTML = ""; // Clear any existing content
-    strategies = JSON.parse(strategies);
-    Object.entries(strategies).forEach(([strategy, description]) => {
-        const strategyElement = document.createElement("div");
-        strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item");
-
-        const strategyDescription = document.createElement("div");
-        strategyDescription.classList.add("text-gray-300", "prose", "prose-sm");
-        strategyDescription.innerHTML = marked.parse(description);
-
-        strategyElement.appendChild(strategyDescription);
-
-        container.appendChild(strategyElement);
-    });
-}
-document.querySelectorAll(".sidebar a").forEach((link) => {
-    link.addEventListener("click", function (event) {
-        event.preventDefault();
-        document.querySelectorAll(".content-section").forEach((section) => {
-            section.classList.remove("active");
-        });
-        const target = event.target.getAttribute("data-target");
-        document.getElementById(target).classList.add("active");
-    });
-});
-// Highlight code syntax
-hljs.highlightAll();
--- a/pages/index
+++ b/pages/index
@@ -1,971 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-    <head>
-        <meta charset="UTF-8" />
-        <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-        <title>Crawl4AI</title>
-
-        <link rel="preconnect" href="https://fonts.googleapis.com" />
-        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
-        <link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
-
-        <!-- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@3.4.3/dist/tailwind.min.css" rel="stylesheet" /> -->
-        <script src="https://cdn.tailwindcss.com"></script>
-        <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
-        <link
-            rel="stylesheet"
-            href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/monokai.min.css"
-        />
-        <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
-
-        <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
-        <style>
-            :root {
-                --ifm-font-size-base: 100%;
-                --ifm-line-height-base: 1.65;
-                --ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
-                    sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
-                    "Segoe UI Emoji", "Segoe UI Symbol";
-            }
-            html {
-                -webkit-font-smoothing: antialiased;
-                -webkit-text-size-adjust: 100%;
-                text-size-adjust: 100%;
-                font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
-            }
-            body {
-                background-color: #1a202c;
-                color: #fff;
-            }
-            .tab-content {
-                max-height: 400px;
-                overflow: auto;
-            }
-            pre {
-                white-space: pre-wrap;
-                font-size: 14px;
-            }
-            pre code {
-                width: 100%;
-            }
-        </style>
-        <style>
-            /* Custom styling for docs-item class and Markdown generated elements */
-            .docs-item {
-                background-color: #2d3748; /* bg-gray-800 */
-                padding: 1rem; /* p-4 */
-                border-radius: 0.375rem; /* rounded */
-                box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); /* shadow-md */
-                margin-bottom: 1rem; /* space between items */
-            }
-
-            .docs-item h3,
-            .docs-item h4 {
-                color: #ffffff; /* text-white */
-                font-size: 1.25rem; /* text-xl */
-                font-weight: 700; /* font-bold */
-                margin-bottom: 0.5rem; /* mb-2 */
-            }
-
-            .docs-item p {
-                color: #e2e8f0; /* text-gray-300 */
-                margin-bottom: 0.5rem; /* mb-2 */
-            }
-
-            .docs-item code {
-                background-color: #1a202c; /* bg-gray-900 */
-                color: #e2e8f0; /* text-gray-300 */
-                padding: 0.25rem 0.5rem; /* px-2 py-1 */
-                border-radius: 0.25rem; /* rounded */
-            }
-
-            .docs-item pre {
-                background-color: #1a202c; /* bg-gray-900 */
-                color: #e2e8f0; /* text-gray-300 */
-                padding: 0.5rem; /* p-2 */
-                border-radius: 0.375rem; /* rounded */
-                overflow: auto; /* overflow-auto */
-                margin-bottom: 0.5rem; /* mb-2 */
-            }
-
-            .docs-item div {
-                color: #e2e8f0; /* text-gray-300 */
-                font-size: 1rem; /* prose prose-sm */
-                line-height: 1.25rem; /* line-height for readability */
-            }
-
-            /* Adjustments to make prose class more suitable for dark mode */
-            .prose {
-                max-width: none; /* max-w-none */
-            }
-
-            .prose p,
-            .prose ul {
-                margin-bottom: 1rem; /* mb-4 */
-            }
-
-            .prose code {
-                /* background-color: #4a5568; */ /* bg-gray-700 */
-                color: #65a30d; /* text-white */
-                padding: 0.25rem 0.5rem; /* px-1 py-0.5 */
-                border-radius: 0.25rem; /* rounded */
-                display: inline-block; /* inline-block */
-            }
-
-            .prose pre {
-                background-color: #1a202c; /* bg-gray-900 */
-                color: #ffffff; /* text-white */
-                padding: 0.5rem; /* p-2 */
-                border-radius: 0.375rem; /* rounded */
-            }
-
-            .prose h3 {
-                color: #65a30d; /* text-white */
-                font-size: 1.25rem; /* text-xl */
-                font-weight: 700; /* font-bold */
-                margin-bottom: 0.5rem; /* mb-2 */
-            }
-        </style>
-    </head>
-    <body class="bg-black text-gray-200">
-        <header class="bg-zinc-950 text-white py-4 flex">
-            <div class="mx-auto px-4">
-                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts</h1>
-            </div>
-            <div class="mx-auto px-4 flex font-bold text-xl gap-2">
-                <span>📊 Total Website Processed</span>
-                <span id="total-count" class="text-lime-400">2</span>
-            </div>
-        </header>
-
-        <section class="try-it py-8 px-16 pb-20">
-            <div class="container mx-auto px-4">
-                <h2 class="text-2xl font-bold mb-4">Try It Now</h2>
-                <div class="grid grid-cols-1 lg:grid-cols-3 gap-4">
-                    <div class="space-y-4">
-                        <div class="flex flex-col">
-                            <label for="url-input" class="text-lime-500 font-bold text-xs">URL(s)</label>
-                            <input
-                                type="text"
-                                id="url-input"
-                                value="https://www.nbcnews.com/business"
-                                class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
-                                placeholder="Enter URL(s) separated by commas"
-                            />
-                        </div>
-                        <div class="flex flex-col">
-                            <label for="threshold" class="text-lime-500 font-bold text-xs">Min Words Threshold</label>
-                            <select
-                                id="threshold"
-                                class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
-                            >
-                                <option value="5">5</option>
-                                <option value="10" selected>10</option>
-                                <option value="15">15</option>
-                                <option value="20">20</option>
-                                <option value="25">25</option>
-                            </select>
-                        </div>
-                        <div class="flex flex-col">
-                            <label for="css-selector" class="text-lime-500 font-bold text-xs">CSS Selector</label>
-                            <input
-                                type="text"
-                                id="css-selector"
-                                class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
-                                placeholder="Enter CSS Selector"
-                            />
-                        </div>
-                        <div class="flex flex-col">
-                            <label for="extraction-strategy-select" class="text-lime-500 font-bold text-xs"
-                                >Extraction Strategy</label
-                            >
-                            <select
-                                id="extraction-strategy-select"
-                                class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
-                            >
-                                <option value="CosineStrategy">CosineStrategy</option>
-                                <option value="LLMExtractionStrategy">LLMExtractionStrategy</option>
-                                <option value="NoExtractionStrategy">NoExtractionStrategy</option>
-                            </select>
-                        </div>
-                        <div class="flex flex-col">
-                            <label for="chunking-strategy-select" class="text-lime-500 font-bold text-xs"
-                                >Chunking Strategy</label
-                            >
-                            <select
-                                id="chunking-strategy-select"
-                                class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-lime-500"
-                            >
-                                <option value="RegexChunking">RegexChunking</option>
-                                <option value="NlpSentenceChunking">NlpSentenceChunking</option>
-                                <option value="TopicSegmentationChunking">TopicSegmentationChunking</option>
-                                <option value="FixedLengthWordChunking">FixedLengthWordChunking</option>
-                                <option value="SlidingWindowChunking">SlidingWindowChunking</option>
-                            </select>
-                        </div>
-                        <div class="flex flex-col">
-                            <label for="provider-model-select" class="text-lime-500 font-bold text-xs"
-                                >Provider Model</label
-                            >
-                            <select
-                                id="provider-model-select"
-                                class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
-                                disabled
-                            >
-                                <option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
-                                <option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
-                                <option value="openai/gpt-4-turbo">gpt-4-turbo</option>
-                                <option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
-                                <option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
-                                <option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
-                                <option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
-                            </select>
-                        </div>
-                        <div class="flex flex-col">
-                            <label for="token-input" class="text-lime-500 font-bold text-xs">API Token</label>
-                            <input
-                                type="password"
-                                id="token-input"
-                                class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-lime-500"
-                                placeholder="Enter Groq API token"
-                                disabled
-                            />
-                        </div>
-                        <div class="flex gap-3">
-                            <div class="flex items-center gap-2">
-                                <input type="checkbox" id="bypass-cache-checkbox" />
-                                <label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
-                            </div>
-                            <div class="flex items-center gap-2">
-                                <input type="checkbox" id="extract-blocks-checkbox" checked />
-                                <label for="extract-blocks-checkbox" class="text-lime-500 font-bold"
-                                    >Extract Blocks</label
-                                >
-                            </div>
-                            <button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">
-                                Crawl
-                            </button>
-                        </div>
-                    </div>
-
-                    <div id="result" class=" ">
-                        <div id="loading" class="hidden">
-                            <p class="text-white">Loading... Please wait.</p>
-                        </div>
-                        <div class="tab-buttons flex gap-2">
-                            <button
-                                class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                                data-tab="json"
-                            >
-                                JSON
-                            </button>
-                            <button
-                                class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                                data-tab="cleaned-html"
-                            >
-                                Cleaned HTML
-                            </button>
-                            <button
-                                class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                                data-tab="markdown"
-                            >
-                                Markdown
-                            </button>
-                        </div>
-                        <div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
-                            <pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
-                            <pre
-                                class="hidden h-full flex"
-                            ><code id="cleaned-html-result" class="language-html"></code></pre>
-                            <pre
-                                class="hidden h-full flex"
-                            ><code id="markdown-result" class="language-markdown"></code></pre>
-                        </div>
-                    </div>
-
-                    <div id="code_help" class=" ">
-                        <div class="tab-buttons flex gap-2">
-                            <button
-                                class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                                data-tab="curl"
-                            >
-                                cURL
-                            </button>
-                            <button
-                                class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                                data-tab="library"
-                            >
-                                Python Library
-                            </button>
-                            <button
-                                class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                                data-tab="python"
-                            >
-                                Python (Request)
-                            </button>
-                            <button
-                                class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                                data-tab="nodejs"
-                            >
-                                Node.js
-                            </button>
-                        </div>
-                        <div class="tab-content result bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
-                            <pre class="h-full flex relative">
-                                <code id="curl-code" class="language-bash"></code>
-                                <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
-                            </pre>
-                            <pre class="hidden h-full flex relative">
-                                <code id="python-code" class="language-python"></code>
-                                <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
-                            </pre>
-                            <pre class="hidden h-full flex relative">
-                                <code id="nodejs-code" class="language-javascript"></code>
-                                <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
-                            </pre>
-                            <pre class="hidden h-full flex relative">
-                                <code id="library-code" class="language-python"></code>
-                                <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="library-code">Copy</button>
-                            </pre>
-                        </div>
-                    </div>
-                </div>
-            </div>
-        </section>
-        <section class="bg-zinc-900 text-zinc-300 p-6 px-20">
-            <div class="grid grid-cols-2 gap-4 p-4 bg-zinc-900 text-lime-500">
-                <!-- Step 1 -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    🌟 <strong>Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun!</strong>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">
-                    First Step: Create an instance of WebCrawler and call the <code>warmup()</code> function.
-                </div>
-                <div>
-                    <pre><code class="language-python">crawler = WebCrawler()
-            crawler.warmup()</code></pre>
-                </div>
-
-                <!-- Step 2 -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    🧠 <strong>Understanding 'bypass_cache' and 'include_raw_html' parameters:</strong>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">First crawl (caches the result):</div>
-                <div>
-                    <pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Second crawl (Force to crawl again):</div>
-                <div>
-                    <pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)</code></pre>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Crawl result without raw HTML content:</div>
-                <div>
-                    <pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)</code></pre>
-                </div>
-
-                <!-- Step 3 -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    📄
-                    <strong
-                        >The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the
-                        response. By default, it is set to True.</strong
-                    >
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Set <code>always_by_pass_cache</code> to True:</div>
-                <div>
-                    <pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
-                </div>
-
-                <!-- Step 4 -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    🧩 <strong>Let's add a chunking strategy: RegexChunking!</strong>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Using RegexChunking:</div>
-                <div>
-                    <pre><code class="language-python">result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                chunking_strategy=RegexChunking(patterns=["\n\n"])
-            )</code></pre>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Using NlpSentenceChunking:</div>
-                <div>
-                    <pre><code class="language-python">result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                chunking_strategy=NlpSentenceChunking()
-            )</code></pre>
-                </div>
-
-                <!-- Step 5 -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    🧠 <strong>Let's get smarter with an extraction strategy: CosineStrategy!</strong>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Using CosineStrategy:</div>
-                <div>
-                    <pre><code class="language-python">result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
-            )</code></pre>
-                </div>
-
-                <!-- Step 6 -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    🤖 <strong>Time to bring in the big guns: LLMExtractionStrategy without instructions!</strong>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Using LLMExtractionStrategy without instructions:</div>
-                <div>
-                    <pre><code class="language-python">result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
-            )</code></pre>
-                </div>
-
-                <!-- Step 7 -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    📜 <strong>Let's make it even more interesting: LLMExtractionStrategy with instructions!</strong>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Using LLMExtractionStrategy with instructions:</div>
-                <div>
-                    <pre><code class="language-python">result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                extraction_strategy=LLMExtractionStrategy(
-                    provider="openai/gpt-4o",
-                    api_token=os.getenv('OPENAI_API_KEY'),
-                    instruction="I am interested in only financial news"
-                )
-            )</code></pre>
-                </div>
-
-                <!-- Step 8 -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    🎯 <strong>Targeted extraction: Let's use a CSS selector to extract only H2 tags!</strong>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Using CSS selector to extract H2 tags:</div>
-                <div>
-                    <pre><code class="language-python">result = crawler.run(
-                url="https://www.nbcnews.com/business",
-                css_selector="h2"
-            )</code></pre>
-                </div>
-
-                <!-- Step 9 -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    🖱️ <strong>Let's get interactive: Passing JavaScript code to click 'Load More' button!</strong>
-                </div>
-                <div class="bg-zinc-800 p-2 rounded">Using JavaScript to click 'Load More' button:</div>
-                <div>
-                    <pre><code class="language-python">js_code = """
-            const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
-            loadMoreButton && loadMoreButton.click();
-            """
-            crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-            crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-            result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
-                </div>
-
-                <!-- Conclusion -->
-                <div class="col-span-2 bg-yellow-500 p-2 rounded text-zinc-900">
-                    🎉
-                    <strong
-                        >Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl
-                        the web like a pro! 🕸️</strong
-                    >
-                </div>
-            </div>
-        </section>
-        <section class="bg-zinc-900 text-zinc-300 p-6 px-20">
-            <h1 class="text-3xl font-bold mb-4">Installation 💻</h1>
-            <p class="mb-4">
-                There are two ways to use Crawl4AI: as a library in your Python projects or as a standalone local
-                server.
-            </p>
-
-            <p class="mb-4">
-                You can also try Crawl4AI in a Google Colab
-                <a href="https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"
-                    ><img
-                        src="https://colab.research.google.com/assets/colab-badge.svg"
-                        alt="Open In Colab"
-                        style="display: inline-block; width: 100px; height: 20px"
-                /></a>
-            </p>
-
-            <h2 class="text-2xl font-bold mb-2">Using Crawl4AI as a Library 📚</h2>
-            <p class="mb-4">To install Crawl4AI as a library, follow these steps:</p>
-
-            <ol class="list-decimal list-inside mb-4">
-                <li class="mb-2">
-                    Install the package from GitHub:
-                    <pre
-                        class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-                    ><code>pip install git+https://github.com/unclecode/crawl4ai.git</code></pre>
-                </li>
-                <li class="mb-2">
-                    Alternatively, you can clone the repository and install the package locally:
-                    <pre
-                        class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-                    ><code  class = "language-python bash">virtualenv venv
-source venv/bin/activate
-git clone https://github.com/unclecode/crawl4ai.git
-cd crawl4ai
-pip install -e .
-        </code></pre>
-                </li>
-                <li>
-                    Import the necessary modules in your Python script:
-                    <pre
-                        class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-                    ><code class = "language-python hljs">from crawl4ai.web_crawler import WebCrawler
-from crawl4ai.chunking_strategy import *
-from crawl4ai.extraction_strategy import *
-import os
-
-crawler = WebCrawler()
-
-# Single page crawl
-single_url = UrlModel(url='https://www.nbcnews.com/business', forced=False)
-result = crawl4ai.fetch_page(
-    url='https://www.nbcnews.com/business',
-    word_count_threshold=5, # Minimum word count for a HTML tag to be considered as a worthy block
-    chunking_strategy= RegexChunking( patterns = ["\\n\\n"]), # Default is RegexChunking
-    extraction_strategy= CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3) # Default is CosineStrategy
-    # extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
-    bypass_cache=False,
-    extract_blocks =True, # Whether to extract semantical blocks of text from the HTML
-    css_selector = "", # Eg: "div.article-body"
-    verbose=True,
-    include_raw_html=True, # Whether to include the raw HTML content in the response
-)
-print(result.model_dump())
-        </code></pre>
-                </li>
-            </ol>
-            <p class="mb-4">
-                For more information about how to run Crawl4AI as a local server, please refer to the
-                <a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
-            </p>
-            
-        </section>
-
-        <section class="bg-zinc-900 text-zinc-300 p-6 px-20">
-            <h1 class="text-3xl font-bold mb-4">📖 Parameters</h1>
-            <div class="overflow-x-auto">
-                <table class="min-w-full bg-zinc-800 border border-zinc-700">
-                    <thead>
-                        <tr>
-                            <th class="py-2 px-4 border-b border-zinc-700">Parameter</th>
-                            <th class="py-2 px-4 border-b border-zinc-700">Description</th>
-                            <th class="py-2 px-4 border-b border-zinc-700">Required</th>
-                            <th class="py-2 px-4 border-b border-zinc-700">Default Value</th>
-                        </tr>
-                    </thead>
-                    <tbody>
-                        <tr>
-                            <td class="py-2 px-4 border-b border-zinc-700">urls</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">
-                                A list of URLs to crawl and extract data from.
-                            </td>
-                            <td class="py-2 px-4 border-b border-zinc-700">Yes</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">-</td>
-                        </tr>
-                        <tr>
-                            <td class="py-2 px-4 border-b border-zinc-700">include_raw_html</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">
-                                Whether to include the raw HTML content in the response.
-                            </td>
-                            <td class="py-2 px-4 border-b border-zinc-700">No</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">false</td>
-                        </tr>
-                        <tr>
-                            <td class="py-2 px-4 border-b border-zinc-700">bypass_cache</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">
-                                Whether to force a fresh crawl even if the URL has been previously crawled.
-                            </td>
-                            <td class="py-2 px-4 border-b border-zinc-700">No</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">false</td>
-                        </tr>
-                        <tr>
-                            <td class="py-2 px-4 border-b border-zinc-700">extract_blocks</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">
-                                Whether to extract semantical blocks of text from the HTML.
-                            </td>
-                            <td class="py-2 px-4 border-b border-zinc-700">No</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">true</td>
-                        </tr>
-                        <tr>
-                            <td class="py-2 px-4 border-b border-zinc-700">word_count_threshold</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">
-                                The minimum number of words a block must contain to be considered meaningful (minimum
-                                value is 5).
-                            </td>
-                            <td class="py-2 px-4 border-b border-zinc-700">No</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">5</td>
-                        </tr>
-                        <tr>
-                            <td class="py-2 px-4 border-b border-zinc-700">extraction_strategy</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">
-                                The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").
-                            </td>
-                            <td class="py-2 px-4 border-b border-zinc-700">No</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">CosineStrategy</td>
-                        </tr>
-                        <tr>
-                            <td class="py-2 px-4 border-b border-zinc-700">chunking_strategy</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">
-                                The strategy to use for chunking the text before processing (e.g., "RegexChunking").
-                            </td>
-                            <td class="py-2 px-4 border-b border-zinc-700">No</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">RegexChunking</td>
-                        </tr>
-                        <tr>
-                            <td class="py-2 px-4 border-b border-zinc-700">css_selector</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">
-                                The CSS selector to target specific parts of the HTML for extraction.
-                            </td>
-                            <td class="py-2 px-4 border-b border-zinc-700">No</td>
-                            <td class="py-2 px-4 border-b border-zinc-700">None</td>
-                        </tr>
-                        <tr>
-                            <td class="py-2 px-4">verbose</td>
-                            <td class="py-2 px-4">Whether to enable verbose logging.</td>
-                            <td class="py-2 px-4">No</td>
-                            <td class="py-2 px-4">true</td>
-                        </tr>
-                    </tbody>
-                </table>
-            </div>
-        </section>
-
-        <section id="extraction" class="py-8 px-20">
-            <div class="overflow-x-auto mx-auto px-6">
-                <h2 class="text-2xl font-bold mb-4">Extraction Strategies</h2>
-                <div id="extraction-strategies" class="space-y-4"></div>
-            </div>
-        </section>
-
-        <section id="chunking" class="py-8 px-20">
-            <div class="overflow-x-auto mx-auto px-6">
-                <h2 class="text-2xl font-bold mb-4">Chunking Strategies</h2>
-                <div id="chunking-strategies" class="space-y-4"></div>
-            </div>
-        </section>
-
-        <section class="hero bg-zinc-900 py-8 px-20">
-            <div class="container mx-auto px-4">
-                <h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
-                <p class="text-lg mb-4">
-                    In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging
-                    for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and
-                    crawling web pages and transforming them into a format suitable for Large Language Models (LLMs).
-                    🕸️🤖 We believe that building a business around this is not the right approach; instead, it should
-                    definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our
-                    philosophy, we invite you to join our "Robinhood" band and help set these products free for the
-                    benefit of all. 🤝💪
-                </p>
-            </div>
-        </section>
-
-        <section class="installation py-8 px-20">
-            <div class="container mx-auto px-4">
-                <h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
-                <p class="mb-4">
-                    To install and run Crawl4AI as a library or a local server, please refer to the 📚
-                    <a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
-                </p>
-            </div>
-        </section>
-
-        <footer class="bg-zinc-900 text-white py-4">
-            <div class="container mx-auto px-4">
-                <div class="flex justify-between items-center">
-                    <p>© 2024 Crawl4AI. All rights reserved.</p>
-                    <div class="social-links">
-                        <a
-                            href="https://github.com/unclecode/crawl4ai"
-                            class="text-white hover:text-gray-300 mx-2"
-                            target="_blank"
-                            >😺 GitHub</a
-                        >
-                        <a
-                            href="https://twitter.com/unclecode"
-                            class="text-white hover:text-gray-300 mx-2"
-                            target="_blank"
-                            >🐦 Twitter</a
-                        >
-                    </div>
-                </div>
-            </div>
-        </footer>
-
-        <script>
-            // JavaScript to manage dynamic form changes and logic
-            document.getElementById("extraction-strategy-select").addEventListener("change", function () {
-                const strategy = this.value;
-                const providerModelSelect = document.getElementById("provider-model-select");
-                const tokenInput = document.getElementById("token-input");
-
-                if (strategy === "LLMExtractionStrategy") {
-                    providerModelSelect.disabled = false;
-                    tokenInput.disabled = false;
-                } else {
-                    providerModelSelect.disabled = true;
-                    tokenInput.disabled = true;
-                }
-            });
-
-            // Get the selected provider model and token from local storage
-            const storedProviderModel = localStorage.getItem("provider_model");
-            const storedToken = localStorage.getItem(storedProviderModel);
-
-            if (storedProviderModel) {
-                document.getElementById("provider-model-select").value = storedProviderModel;
-            }
-
-            if (storedToken) {
-                document.getElementById("token-input").value = storedToken;
-            }
-
-            // Handle provider model dropdown change
-            document.getElementById("provider-model-select").addEventListener("change", () => {
-                const selectedProviderModel = document.getElementById("provider-model-select").value;
-                const storedToken = localStorage.getItem(selectedProviderModel);
-
-                if (storedToken) {
-                    document.getElementById("token-input").value = storedToken;
-                } else {
-                    document.getElementById("token-input").value = "";
-                }
-            });
-
-            // Fetch total count from the database
-            axios
-                .get("/total-count")
-                .then((response) => {
-                    document.getElementById("total-count").textContent = response.data.count;
-                })
-                .catch((error) => console.error(error));
-
-            // Handle crawl button click
-            document.getElementById("crawl-btn").addEventListener("click", () => {
-                // validate input to have both URL and API token
-                if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
-                    alert("Please enter both URL(s) and API token.");
-                    return;
-                }
-
-                const selectedProviderModel = document.getElementById("provider-model-select").value;
-                const apiToken = document.getElementById("token-input").value;
-                const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
-                const bypassCache = document.getElementById("bypass-cache-checkbox").checked;
-
-                // Save the selected provider model and token to local storage
-                localStorage.setItem("provider_model", selectedProviderModel);
-                localStorage.setItem(selectedProviderModel, apiToken);
-
-                const urlsInput = document.getElementById("url-input").value;
-                const urls = urlsInput.split(",").map((url) => url.trim());
-                const data = {
-                    urls: urls,
-                    provider_model: selectedProviderModel,
-                    api_token: apiToken,
-                    include_raw_html: true,
-                    bypass_cache: bypassCache,
-                    extract_blocks: extractBlocks,
-                    word_count_threshold: parseInt(document.getElementById("threshold").value),
-                    extraction_strategy: document.getElementById("extraction-strategy-select").value,
-                    chunking_strategy: document.getElementById("chunking-strategy-select").value,
-                    css_selector: document.getElementById("css-selector").value,
-                    verbose: true,
-                };
-
-                // save api token to local storage
-                localStorage.setItem("api_token", document.getElementById("token-input").value);
-
-                document.getElementById("loading").classList.remove("hidden");
-                //document.getElementById("result").classList.add("hidden");
-                //document.getElementById("code_help").classList.add("hidden");
-
-                axios
-                    .post("/crawl", data)
-                    .then((response) => {
-                        const result = response.data.results[0];
-                        const parsedJson = JSON.parse(result.extracted_content);
-                        document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
-                        document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
-                        document.getElementById("markdown-result").textContent = result.markdown;
-
-                        // Update code examples dynamically
-                        const extractionStrategy = data.extraction_strategy;
-                        const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
-
-                        document.getElementById(
-                            "curl-code"
-                        ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
-                            ...data,
-                            api_token: isLLMExtraction ? "your_api_token" : undefined,
-                        })}' http://crawl4ai.uccode.io/crawl`;
-
-                        document.getElementById(
-                            "python-code"
-                        ).textContent = `import requests\n\ndata = ${JSON.stringify(
-                            { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
-                            null,
-                            2
-                        )}\n\nresponse = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
-
-                        document.getElementById(
-                            "nodejs-code"
-                        ).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
-                            { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
-                            null,
-                            2
-                        )};\n\naxios.post("http://crawl4ai.uccode.io/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
-
-                        document.getElementById(
-                            "library-code"
-                        ).textContent = `from crawl4ai.web_crawler import WebCrawler\nfrom crawl4ai.extraction_strategy import *\nfrom crawl4ai.chunking_strategy import *\n\ncrawler = WebCrawler()\ncrawler.warmup()\n\nresult = crawler.run(\n    url='${
-                            urls[0]
-                        }',\n    word_count_threshold=${data.word_count_threshold},\n    extraction_strategy=${
-                            isLLMExtraction
-                                ? `${extractionStrategy}(provider="${data.provider_model}", api_token="${data.api_token}")`
-                                : extractionStrategy + "()"
-                        },\n    chunking_strategy=${data.chunking_strategy}(),\n    bypass_cache=${
-                            data.bypass_cache
-                        },\n    css_selector="${data.css_selector}"\n)\nprint(result)`;
-
-                        // Highlight code syntax
-                        hljs.highlightAll();
-
-                        // Select JSON tab by default
-                        document.querySelector('.tab-btn[data-tab="json"]').click();
-
-                        document.getElementById("loading").classList.add("hidden");
-                        document.getElementById("result").classList.remove("hidden");
-                        document.getElementById("code_help").classList.remove("hidden");
-
-                        // increment the total count
-                        document.getElementById("total-count").textContent =
-                            parseInt(document.getElementById("total-count").textContent) + 1;
-                    })
-                    .catch((error) => {
-                        console.error(error);
-                        document.getElementById("loading").classList.add("hidden");
-                    });
-            });
-
-            // Handle tab clicks
-            document.querySelectorAll(".tab-btn").forEach((btn) => {
-                btn.addEventListener("click", () => {
-                    const tab = btn.dataset.tab;
-                    document
-                        .querySelectorAll(".tab-btn")
-                        .forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
-                    btn.classList.add("bg-lime-700", "text-white");
-                    document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
-                    document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
-                });
-            });
-
-            // Handle code tab clicks
-            document.querySelectorAll(".code-tab-btn").forEach((btn) => {
-                btn.addEventListener("click", () => {
-                    const tab = btn.dataset.tab;
-                    document
-                        .querySelectorAll(".code-tab-btn")
-                        .forEach((b) => b.classList.remove("bg-lime-700", "text-white"));
-                    btn.classList.add("bg-lime-700", "text-white");
-                    document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
-                    document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
-                });
-            });
-
-            // Handle copy to clipboard button clicks
-
-            async function copyToClipboard(text) {
-                if (navigator.clipboard && navigator.clipboard.writeText) {
-                    return navigator.clipboard.writeText(text);
-                } else {
-                    return fallbackCopyTextToClipboard(text);
-                }
-            }
-
-            function fallbackCopyTextToClipboard(text) {
-                return new Promise((resolve, reject) => {
-                    const textArea = document.createElement("textarea");
-                    textArea.value = text;
-
-                    // Avoid scrolling to bottom
-                    textArea.style.top = "0";
-                    textArea.style.left = "0";
-                    textArea.style.position = "fixed";
-
-                    document.body.appendChild(textArea);
-                    textArea.focus();
-                    textArea.select();
-
-                    try {
-                        const successful = document.execCommand("copy");
-                        if (successful) {
-                            resolve();
-                        } else {
-                            reject();
-                        }
-                    } catch (err) {
-                        reject(err);
-                    }
-
-                    document.body.removeChild(textArea);
-                });
-            }
-
-            document.querySelectorAll(".copy-btn").forEach((btn) => {
-                btn.addEventListener("click", () => {
-                    const target = btn.dataset.target;
-                    const code = document.getElementById(target).textContent;
-                    //navigator.clipboard.writeText(code).then(() => {
-                    copyToClipboard(code).then(() => {
-                        btn.textContent = "Copied!";
-                        setTimeout(() => {
-                            btn.textContent = "Copy";
-                        }, 2000);
-                    });
-                });
-            });
-
-            document.addEventListener("DOMContentLoaded", async () => {
-                try {
-                    const extractionResponse = await fetch("/strategies/extraction");
-                    const extractionStrategies = await extractionResponse.json();
-
-                    const chunkingResponse = await fetch("/strategies/chunking");
-                    const chunkingStrategies = await chunkingResponse.json();
-
-                    renderStrategies("extraction-strategies", extractionStrategies);
-                    renderStrategies("chunking-strategies", chunkingStrategies);
-                } catch (error) {
-                    console.error("Error fetching strategies:", error);
-                }
-            });
-
-            function renderStrategies(containerId, strategies) {
-                const container = document.getElementById(containerId);
-                container.innerHTML = ""; // Clear any existing content
-                strategies = JSON.parse(strategies);
-                Object.entries(strategies).forEach(([strategy, description]) => {
-                    const strategyElement = document.createElement("div");
-                    strategyElement.classList.add("bg-zinc-800", "p-4", "rounded", "shadow-md", "docs-item");
-
-                    const strategyDescription = document.createElement("div");
-                    strategyDescription.classList.add("text-gray-300", "prose", "prose-sm");
-                    strategyDescription.innerHTML = marked.parse(description);
-
-                    strategyElement.appendChild(strategyDescription);
-
-                    container.appendChild(strategyElement);
-                });
-            }
-
-            // Highlight code syntax
-            hljs.highlightAll();
-        </script>
-    </body>
-</html>
--- a/pages/index.html
+++ b/pages/index.html
@@ -1,73 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-    <head>
-        <meta charset="UTF-8" />
-        <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-        <title>Crawl4AI</title>
-
-        <link rel="preconnect" href="https://fonts.googleapis.com" />
-        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
-        <link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
-
-        <!-- <link href="https://cdn.jsdelivr.net/npm/tailwindcss@3.4.3/dist/tailwind.min.css" rel="stylesheet" /> -->
-        <script src="https://cdn.tailwindcss.com"></script>
-        <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
-        <link rel="stylesheet" href="/pages/app.css" />
-        <link
-            rel="stylesheet"
-            href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/monokai.min.css"
-        />
-        <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
-
-        <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
-    </head>
-    <body class="bg-black text-gray-200">
-        <header class="bg-zinc-950 text-lime-500 py-4 flex">
-            
-            <div class="mx-auto px-4">
-                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts</h1>
-            </div>
-            <div class="mx-auto px-4 flex font-bold text-xl gap-2">
-                <span>📊 Total Website Processed</span>
-                <span id="total-count" class="text-lime-400">2</span>
-            </div>
-        </header>
-        
-            {{ try_it | safe }}
-
-            <div class="mx-auto p-4 bg-zinc-950 text-lime-500 min-h-screen">
-                <div class="container mx-auto">
-                <div class="flex h-full px-20">
-                    <div class="sidebar w-1/4 p-4">
-                        <h2 class="text-lg font-bold mb-4">Outline</h2>
-                        <ul>
-                            <li class="mb-2"><a href="#" data-target="installation">Installation</a></li>
-                            <li class="mb-2"><a href="#" data-target="how-to-guide">How to Guide</a></li>
-                            <li class="mb-2"><a href="#" data-target="chunking-strategies">Chunking Strategies</a></li>
-                            <li class="mb-2">
-                                <a href="#" data-target="extraction-strategies">Extraction Strategies</a>
-                            </li>
-                        </ul>
-                    </div>
-
-                    <!-- Main Content -->
-                    <div class="w-3/4 p-4">
-                        {{installation | safe}} {{how_to_guide | safe}}
-
-                        <section id="chunking-strategies" class="content-section">
-                            <h1 class="text-2xl font-bold">Chunking Strategies</h1>
-                            <p>Content for chunking strategies...</p>
-                        </section>
-                        <section id="extraction-strategies" class="content-section">
-                            <h1 class="text-2xl font-bold">Extraction Strategies</h1>
-                            <p>Content for extraction strategies...</p>
-                        </section>
-                    </div>
-                </div>
-            </div>
-            </div>
-        
-        {{ footer | safe }}
-        <script script src="/pages/app.js"></script>
-    </body>
-</html>
--- a/pages/index_pooling.html
+++ b/pages/index_pooling.html
@@ -1,425 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-    <head>
-        <meta charset="UTF-8" />
-        <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-        <title>Crawl4AI</title>
-
-        <link rel="preconnect" href="https://fonts.googleapis.com" />
-        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
-        <link href="https://fonts.googleapis.com/css2?family=Inter:wght@100..900&display=swap" rel="stylesheet" />
-
-        <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet" />
-        <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
-        <link
-            rel="stylesheet"
-            href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/styles/vs2015.min.css"
-        />
-        <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.7.0/highlight.min.js"></script>
-        <style>
-            :root {
-                --ifm-font-size-base: 100%;
-                --ifm-line-height-base: 1.65;
-                --ifm-font-family-base: system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans,
-                    sans-serif, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif, "Apple Color Emoji",
-                    "Segoe UI Emoji", "Segoe UI Symbol";
-            }
-            html {
-                -webkit-font-smoothing: antialiased;
-                -webkit-text-size-adjust: 100%;
-                text-size-adjust: 100%;
-                font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base);
-            }
-            body {
-                background-color: #1a202c;
-                color: #fff;
-            }
-            .tab-content {
-                max-height: 400px;
-                overflow: auto;
-            }
-            pre {
-                white-space: pre-wrap;
-                font-size: 14px;
-            }
-            pre code {
-                width: 100%;
-            }
-        </style>
-    </head>
-    <body>
-        <header class="bg-gray-900 text-white py-4">
-            <div class="container mx-auto px-4">
-                <h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Open-source LLM Friendly Web scraper</h1>
-            </div>
-        </header>
-
-        <section class="try-it py-8 pb-20">
-            <div class="container mx-auto px-4">
-                <h2 class="text-2xl font-bold mb-4">Try It Now</h2>
-                <div class="mb-4 flex w-full gap-2">
-                    <input
-                        type="text"
-                        id="url-input"
-                        value="https://kidocode.com"
-                        class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
-                        placeholder="Enter URL(s) separated by commas"
-                    />
-                    <select
-                        id="provider-model-select"
-                        class="border border-gray-600 rounded px-4 py-2 bg-gray-800 text-white"
-                    >
-                        <!-- Add your option values here -->
-                        <option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
-                        <option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
-                        <option value="openai/gpt-4-turbo">gpt-4-turbo</option>
-                        <option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
-                        <option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
-                        <option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
-                        <option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
-                    </select>
-                    <input
-                        type="password"
-                        id="token-input"
-                        class="border border-gray-600 rounded px-4 py-2 flex-grow bg-gray-800 text-white"
-                        placeholder="Enter Groq API token"
-                    />
-                    <div class="flex items-center justify-center">
-                        <input type="checkbox" id="extract-blocks-checkbox" class="mr-2" checked>
-                        <label for="extract-blocks-checkbox" class="text-white">Extract Blocks</label>
-                    </div>
-                    <button id="crawl-btn" class="bg-blue-600 text-white px-4 py-2 rounded">Crawl</button>
-                </div>
-                <div class="grid grid-cols-1 md:grid-cols-2 gap-8">
-                    <div id="loading" class="hidden mt-4">
-                        <p>Loading...</p>
-                    </div>
-                    <div id="result" class="tab-container flex-1 h-full flex-col">
-                        <div class="tab-buttons flex gap-2">
-                            <button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="json">JSON</button>
-                            <button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="cleaned-html">
-                                Cleaned HTML
-                            </button>
-                            <button class="tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="markdown">
-                                Markdown
-                            </button>
-                        </div>
-                        <div class="tab-content code bg-gray-800 p-2 rounded  h-full  flex-1 border border-gray-600">
-                            <pre class="h-full flex"><code id="json-result" class="language-json "></code></pre>
-                            <pre
-                                class="hidden h-full flex"
-                            ><code id="cleaned-html-result" class="language-html "></code></pre>
-                            <pre
-                                class="hidden h-full flex"
-                            ><code id="markdown-result" class="language-markdown "></code></pre>
-                        </div>
-                    </div>
-                    <div id="code_help" class="tab-container flex-1 h-full">
-                        <div class="tab-buttons flex gap-2">
-                            <button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="curl">cURL</button>
-                            <button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="python">
-                                Python
-                            </button>
-                            <button class="code-tab-btn px-4 py-2 bg-gray-700 rounded-t" data-tab="nodejs">
-                                Node.js
-                            </button>
-                        </div>
-                        <div class="tab-content result bg-gray-800 p-2 rounded h-full  flex-1 border border-gray-600">
-                            <pre class="h-full flex relative">
-                                    <code id="curl-code" class="language-bash"></code>
-                                    <button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
-                                </pre>
-                            <pre class="hidden h-full flex relative">
-                                    <code id="python-code" class="language-python"></code>
-                                    <button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
-                                </pre>
-                            <pre class="hidden h-full flex relative">
-                                    <code id="nodejs-code" class="language-javascript"></code>
-                                    <button class="absolute top-2 right-2 bg-gray-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
-                                </pre>
-                        </div>
-                    </div>
-                </div>
-            </div>
-        </section>
-
-        <section class="hero bg-gray-900 py-8">
-            <div class="container mx-auto px-4">
-                <h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
-                <p class="text-lg mb-4">
-                    In recent times, we've seen numerous startups emerging, riding the AI hype wave and charging for
-                    services that should rightfully be accessible to everyone. 🌍💸 One for example is to scrap and crawl 
-                    a web page, and transform it o a form suitable for LLM. We don't think one should build a business
-                    out of this, but definilty should be opened source. So if you possess the skills to build such things 
-                    and you have such philosphy you should join our "Robinhood" band and help set
-                    these products free. 🆓🤝
-                </p>
-            </div>
-        </section>
-
-        <section class="installation py-8">
-            <div class="container mx-auto px-4">
-                <h2 class="text-2xl font-bold mb-4">⚙️ Installation</h2>
-                <p class="mb-4">
-                    To install and run Crawl4AI locally or on your own service, the best way is to use Docker. 🐳 Follow
-                    these steps:
-                </p>
-                <ol class="list-decimal list-inside mb-4">
-                    <li>
-                        Clone the GitHub repository: 📥
-                        <code>git clone https://github.com/unclecode/crawl4ai.git</code>
-                    </li>
-                    <li>Navigate to the project directory: 📂 <code>cd crawl4ai</code></li>
-                    <li>
-                        Build the Docker image: 🛠️ <code>docker build -t crawl4ai .</code> On Mac, follow: 🍎
-                        <code>docker build --platform linux/amd64 -t crawl4ai .</code>
-                    </li>
-                    <li>Run the Docker container: ▶️ <code>docker run -p 8000:80 crawl4ai</code></li>
-                </ol>
-                <p>
-                    For more detailed instructions and advanced configuration options, please refer to the 📚
-                    <a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
-                </p>
-            </div>
-        </section>
-
-        <footer class="bg-gray-900 text-white py-4">
-            <div class="container mx-auto px-4">
-                <div class="flex justify-between items-center">
-                    <p>© 2024 Crawl4AI. All rights reserved.</p>
-                    <div class="social-links">
-                        <a
-                            href="https://github.com/unclecode/crawl4ai"
-                            class="text-white hover:text-gray-300 mx-2"
-                            target="_blank"
-                            >😺 GitHub</a
-                        >
-                        <a
-                            href="https://twitter.com/unclecode"
-                            class="text-white hover:text-gray-300 mx-2"
-                            target="_blank"
-                            >🐦 Twitter</a
-                        >
-                        <a
-                            href="https://discord.gg/your-invite-link"
-                            class="text-white hover:text-gray-300 mx-2"
-                            target="_blank"
-                            >💬 Discord</a
-                        >
-                    </div>
-                </div>
-            </div>
-        </footer>
-
-        <script>
-            // Get the selected provider model and token from local storage
-            const storedProviderModel = localStorage.getItem("provider_model");
-            const storedToken = localStorage.getItem(storedProviderModel);
-
-            if (storedProviderModel) {
-                document.getElementById("provider-model-select").value = storedProviderModel;
-            }
-
-            if (storedToken) {
-                document.getElementById("token-input").value = storedToken;
-            }
-
-            // Handle provider model dropdown change
-            document.getElementById("provider-model-select").addEventListener("change", () => {
-                const selectedProviderModel = document.getElementById("provider-model-select").value;
-                const storedToken = localStorage.getItem(selectedProviderModel);
-
-                if (storedToken) {
-                    document.getElementById("token-input").value = storedToken;
-                } else {
-                    document.getElementById("token-input").value = "";
-                }
-            });
-
-            // Fetch total count from the database
-            axios
-                .get("/total-count")
-                .then((response) => {
-                    document.getElementById("total-count").textContent = response.data.count;
-                })
-                .catch((error) => console.error(error));
-
-            // Handle crawl button click
-            document.getElementById("crawl-btn").addEventListener("click", () => {
-                // validate input to have both URL and API token
-                if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
-                    alert("Please enter both URL(s) and API token.");
-                    return;
-                }
-
-                const selectedProviderModel = document.getElementById("provider-model-select").value;
-                const apiToken = document.getElementById("token-input").value;
-                const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
-
-
-                // Save the selected provider model and token to local storage
-                localStorage.setItem("provider_model", selectedProviderModel);
-                localStorage.setItem(selectedProviderModel, apiToken);
-
-                const urlsInput = document.getElementById("url-input").value;
-                const urls = urlsInput.split(",").map((url) => url.trim());
-                const data = {
-                    urls: urls,
-                    provider_model: selectedProviderModel,
-                    api_token: apiToken,
-                    include_raw_html: true,
-                    forced: false,
-                    extract_blocks: extractBlocks,
-                };
-
-                // save api token to local storage
-                localStorage.setItem("api_token", document.getElementById("token-input").value);
-
-                document.getElementById("loading").classList.remove("hidden");
-                document.getElementById("result").classList.add("hidden");
-                document.getElementById("code_help").classList.add("hidden");
-
-                axios
-                    .post("/crawl", data)
-                    .then((response) => {
-                        const result = response.data.results[0];
-                        const parsedJson = JSON.parse(result.extracted_content);
-                        document.getElementById("json-result").textContent = JSON.stringify(parsedJson, null, 2);
-                        document.getElementById("cleaned-html-result").textContent = result.cleaned_html;
-                        document.getElementById("markdown-result").textContent = result.markdown;
-
-                        // Update code examples dynamically
-                        // Update code examples dynamically
-                        document.getElementById(
-                            "curl-code"
-                        ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
-                            ...data,
-                            api_token: "your_api_token",
-                        })}' http://localhost:8000/crawl`;
-
-                        document.getElementById(
-                            "python-code"
-                        ).textContent = `import requests\n\ndata = ${JSON.stringify(
-                            { ...data, api_token: "your_api_token" },
-                            null,
-                            2
-                        )}\n\nresponse = requests.post("http://localhost:8000/crawl", json=data)\nprint(response.json())`;
-
-                        document.getElementById(
-                            "nodejs-code"
-                        ).textContent = `const axios = require('axios');\n\nconst data = ${JSON.stringify(
-                            { ...data, api_token: "your_api_token" },
-                            null,
-                            2
-                        )};\n\naxios.post("http://localhost:8000/crawl", data)\n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
-                        // Highlight code syntax
-                        hljs.highlightAll();
-
-                        // Select JSON tab by default
-                        document.querySelector('.tab-btn[data-tab="json"]').click();
-
-                        document.getElementById("loading").classList.add("hidden");
-                        document.getElementById("result").classList.remove("hidden");
-                        document.getElementById("code_help").classList.remove("hidden");
-                    })
-                    .catch((error) => {
-                        console.error(error);
-                        document.getElementById("loading").classList.add("hidden");
-                    });
-            });
-
-            // Handle tab clicks
-            document.querySelectorAll(".tab-btn").forEach((btn) => {
-                btn.addEventListener("click", () => {
-                    const tab = btn.dataset.tab;
-                    document
-                        .querySelectorAll(".tab-btn")
-                        .forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
-                    btn.classList.add("bg-blue-600", "text-white");
-                    document.querySelectorAll(".tab-content.code pre").forEach((el) => el.classList.add("hidden"));
-                    document.getElementById(`${tab}-result`).parentElement.classList.remove("hidden");
-                });
-            });
-
-            // Handle code tab clicks
-            document.querySelectorAll(".code-tab-btn").forEach((btn) => {
-                btn.addEventListener("click", () => {
-                    const tab = btn.dataset.tab;
-                    document
-                        .querySelectorAll(".code-tab-btn")
-                        .forEach((b) => b.classList.remove("bg-blue-600", "text-white"));
-                    btn.classList.add("bg-blue-600", "text-white");
-                    document.querySelectorAll(".tab-content.result pre").forEach((el) => el.classList.add("hidden"));
-                    document.getElementById(`${tab}-code`).parentElement.classList.remove("hidden");
-                });
-            });
-
-            // Handle copy to clipboard button clicks
-            document.querySelectorAll(".copy-btn").forEach((btn) => {
-                btn.addEventListener("click", () => {
-                    const target = btn.dataset.target;
-                    const code = document.getElementById(target).textContent;
-                    navigator.clipboard.writeText(code).then(() => {
-                        btn.textContent = "Copied!";
-                        setTimeout(() => {
-                            btn.textContent = "Copy";
-                        }, 2000);
-                    });
-                });
-            });
-
-
-            document.getElementById("crawl-btn").addEventListener("click", () => {
-                const urlsInput = document.getElementById("url-input").value;
-                const urls = urlsInput.split(",").map(url => url.trim());
-                const apiToken = document.getElementById("token-input").value;
-                const selectedProviderModel = document.getElementById("provider-model-select").value;
-                const extractBlocks = document.getElementById("extract-blocks-checkbox").checked;
-            
-                const data = {
-                    urls: urls,
-                    provider_model: selectedProviderModel,
-                    api_token: apiToken,
-                    include_raw_html: true,
-                    forced: false,
-                    extract_blocks: extractBlocks
-                };
-            
-                localStorage.setItem("api_token", apiToken);
-            
-                document.getElementById("loading").classList.remove("hidden");
-                document.getElementById("result").classList.add("hidden");
-                document.getElementById("code_help").classList.add("hidden");
-            
-                axios.post("/crawl", data)
-                    .then(response => {
-                        const taskId = response.data.task_id;
-                        pollTaskStatus(taskId);
-                    })
-                    .catch(error => {
-                        console.error('Error during fetch:', error);
-                        document.getElementById("loading").classList.add("hidden");
-                    });
-            });
-            
-            function pollTaskStatus(taskId) {
-                axios.get(`/task/${taskId}`)
-                    .then(response => {
-                        const task = response.data;
-                        if (task.status === 'done') {
-                            displayResults(task.results[0]);
-                        } else if (task.status === 'pending') {
-                            setTimeout(() => pollTaskStatus(taskId), 2000);  // Poll every 2 seconds
-                        } else {
-                            console.error('Task failed:', task.error);
-                            document.getElementById("loading").classList.add("hidden");
-                        }
-                    })
-                    .catch(error => {
-                        console.error('Error polling task status:', error);
-                        document.getElementById("loading").classList.add("hidden");
-                    });
-            }
-        </script>
-    </body>
-</html>
--- a/pages/partial/footer.html
+++ b/pages/partial/footer.html
@@ -1,36 +0,0 @@
-<section class="hero bg-zinc-900 py-8 px-20 text-zinc-400">
-    <div class="container mx-auto px-4">
-        <h2 class="text-3xl font-bold mb-4">🤔 Why building this?</h2>
-        <p class="text-lg mb-4">
-            In recent times, we've witnessed a surge of startups emerging, riding the AI hype wave and charging
-            for services that should rightfully be accessible to everyone. 🌍💸 One such example is scraping and
-            crawling web pages and transforming them into a format suitable for Large Language Models (LLMs).
-            🕸️🤖 We believe that building a business around this is not the right approach; instead, it should
-            definitely be open-source. 🆓🌟 So, if you possess the skills to build such tools and share our
-            philosophy, we invite you to join our "Robinhood" band and help set these products free for the
-            benefit of all. 🤝💪
-        </p>
-    </div>
-</section>
-
-<footer class="bg-zinc-900 text-zinc-400 py-4">
-    <div class="container mx-auto px-4">
-        <div class="flex justify-between items-center">
-            <p>© 2024 Crawl4AI. All rights reserved.</p>
-            <div class="social-links">
-                <a
-                    href="https://github.com/unclecode/crawl4ai"
-                    class="text-zinc-400 hover:text-gray-300 mx-2"
-                    target="_blank"
-                    >😺 GitHub</a
-                >
-                <a
-                    href="https://twitter.com/unclecode"
-                    class="text-zinc-400 hover:text-gray-300 mx-2"
-                    target="_blank"
-                    >🐦 Twitter</a
-                >
-            </div>
-        </div>
-    </div>
-</footer>
--- a/pages/partial/how_to_guide.html
+++ b/pages/partial/how_to_guide.html
@@ -1,174 +0,0 @@
-<section id="how-to-guide" class="content-section">
-    <h1 class="text-2xl font-bold">How to Guide</h1>
-    <div class="flex flex-col gap-4 p-4 bg-zinc-900 text-lime-500">
-        <!-- Step 1 -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            🌟
-            <strong
-                >Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling
-                fun!</strong
-            >
-        </div>
-        <div class="">
-            First Step: Create an instance of WebCrawler and call the
-            <code>warmup()</code> function.
-        </div>
-        <div>
-            <pre><code class="language-python">crawler = WebCrawler()
-crawler.warmup()</code></pre>
-        </div>
-
-        <!-- Step 2 -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            🧠 <strong>Understanding 'bypass_cache' and 'include_raw_html' parameters:</strong>
-        </div>
-        <div class="">First crawl (caches the result):</div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business")</code></pre>
-        </div>
-        <div class="">Second crawl (Force to crawl again):</div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)</code></pre>
-            <div class="bg-red-900 p-2 text-zinc-50">
-                ⚠️ Don't forget to set <code>`bypass_cache`</code> to True if you want to try different strategies for the same URL. Otherwise, the cached result will be returned. You can also set <code>`always_by_pass_cache`</code> in constructor to True to always bypass the cache.
-            </div>
-        </div>
-        <div class="">Crawl result without raw HTML content:</div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)</code></pre>
-        </div>
-
-        <!-- Step 3 -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            📄
-            <strong
-                >The 'include_raw_html' parameter, when set to True, includes the raw HTML content
-                in the response. By default, it is set to True.</strong
-            >
-        </div>
-        <div class="">Set <code>always_by_pass_cache</code> to True:</div>
-        <div>
-            <pre><code class="language-python">crawler.always_by_pass_cache = True</code></pre>
-        </div>
-        <!-- Step 3.5 Screenshot -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            📸
-            <strong>Let's take a screenshot of the page!</strong>
-        </div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    screenshot=True
-)
-with open("screenshot.png", "wb") as f:
-    f.write(base64.b64decode(result.screenshot))</code></pre>
-        </div>
-
-
-        <!-- Step 4 -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            🧩 <strong>Let's add a chunking strategy: RegexChunking!</strong>
-        </div>
-        <div class="">Using RegexChunking:</div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    chunking_strategy=RegexChunking(patterns=["\n\n"])
-)</code></pre>
-        </div>
-        <div class="">Using NlpSentenceChunking:</div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    chunking_strategy=NlpSentenceChunking()
-)</code></pre>
-        </div>
-
-        <!-- Step 5 -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            🧠 <strong>Let's get smarter with an extraction strategy: CosineStrategy!</strong>
-        </div>
-        <div class="">Using CosineStrategy:</div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3)
-)</code></pre>
-        </div>
-
-        <!-- Step 6 -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            🤖
-            <strong
-                >Time to bring in the big guns: LLMExtractionStrategy without instructions!</strong
-            >
-        </div>
-        <div class="">Using LLMExtractionStrategy without instructions:</div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
-)</code></pre>
-        </div>
-
-        <!-- Step 7 -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            📜
-            <strong
-                >Let's make it even more interesting: LLMExtractionStrategy with
-                instructions!</strong
-            >
-        </div>
-        <div class="">Using LLMExtractionStrategy with instructions:</div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    extraction_strategy=LLMExtractionStrategy(
-    provider="openai/gpt-4o",
-    api_token=os.getenv('OPENAI_API_KEY'),
-    instruction="I am interested in only financial news"
-)
-)</code></pre>
-        </div>
-
-        <!-- Step 8 -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            🎯
-            <strong>Targeted extraction: Let's use a CSS selector to extract only H2 tags!</strong>
-        </div>
-        <div class="">Using CSS selector to extract H2 tags:</div>
-        <div>
-            <pre><code class="language-python">result = crawler.run(
-    url="https://www.nbcnews.com/business",
-    css_selector="h2"
-)</code></pre>
-        </div>
-
-        <!-- Step 9 -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            🖱️
-            <strong
-                >Let's get interactive: Passing JavaScript code to click 'Load More' button!</strong
-            >
-        </div>
-        <div class="">Using JavaScript to click 'Load More' button:</div>
-        <div>
-            <pre><code class="language-python">js_code = ["""
-const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
-loadMoreButton && loadMoreButton.click();
-"""]
-crawler = WebCrawler(verbos=crawler_strategy, always_by_pass_cache=True)
-result = crawler.run(url="https://www.nbcnews.com/business", js = js_code)</code></pre>
-        <div class="">Remember that you can pass multiple JavaScript code snippets in the list. They all will be executed in the order they are passed.</div>
-        </div>
-
-        <!-- Conclusion -->
-        <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-            🎉
-            <strong
-                >Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth
-                and crawl the web like a pro! 🕸️</strong
-            >
-        </div>
-    </div>
-</section>
--- a/pages/partial/installation.html
+++ b/pages/partial/installation.html
@@ -1,65 +0,0 @@
-<section id="installation" class="content-section active">
-    <h1 class="text-2xl font-bold">Installation 💻</h1>
-    <p class="mb-4">
-        There are three ways to use Crawl4AI: 
-        <ol class="list-decimal list-inside mb-4">
-            <li class="">
-                As a library
-            </li>
-            <li class="">
-                As a local server (Docker)
-            </li>
-            <li class="">
-                As a Google Colab notebook. <a href="https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"
-                ><img
-                    src="https://colab.research.google.com/assets/colab-badge.svg"
-                    alt="Open In Colab"
-                    style="display: inline-block; width: 100px; height: 20px"
-            /></a>
-            </li>
-    </p>
-
-
-    <p class="my-4">To install Crawl4AI as a library, follow these steps:</p>
-
-    <ol class="list-decimal list-inside mb-4">
-        <li class="mb-4">
-            Install the package from GitHub:
-            <pre
-                class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-            ><code>virtualenv venv
-source venv/bin/activate
-pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
-            </code></pre>
-        </li>
-        <li class="mb-4">
-            Run the following command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
-            <pre
-                class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-            ><code>crawl4ai-download-models</code></pre>
-        </li>
-        <li class="mb-4">
-            Alternatively, you can clone the repository and install the package locally:
-            <pre
-                class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-            ><code  class = "language-python bash">virtualenv venv
-source venv/bin/activate
-git clone https://github.com/unclecode/crawl4ai.git
-cd crawl4ai
-pip install -e .[all]
-</code></pre>
-        </li>
-        <li class="">
-            Use docker to run the local server:
-            <pre
-                class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-            ><code  class = "language-python bash">docker build -t crawl4ai . 
-# docker build --platform linux/amd64 -t crawl4ai . For Mac users
-docker run -d -p 8000:80 crawl4ai</code></pre>
-        </li>
-    </ol>
-    <p class="mb-4">
-        For more information about how to run Crawl4AI as a local server, please refer to the
-        <a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
-    </p>
-</section>
--- a/pages/partial/try_it.html
+++ b/pages/partial/try_it.html
@@ -1,217 +0,0 @@
-<section class="try-it py-8 px-16 pb-20 bg-zinc-900 overflow-hidden">
-    <div class="container mx-auto ">
-        <h2 class="text-2xl font-bold mb-4 text-lime-500">Try It Now</h2>
-        <div class="flex gap-4">
-            <div class="flex flex-col flex-1 gap-2">
-                <div class="flex flex-col">
-                    <label for="url-input" class="text-lime-500 font-bold text-xs">URL(s)</label>
-                    <input
-                        type="text"
-                        id="url-input"
-                        value="https://www.nbcnews.com/business"
-                        class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300"
-                        placeholder="Enter URL(s) separated by commas"
-                    />
-                </div>
-                <div class="flex gap-2">
-                    <div class="flex flex-col">
-                        <label for="threshold" class="text-lime-500 font-bold text-xs">Min Words Threshold</label>
-                        <select
-                            id="threshold"
-                            class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
-                        >
-                            <option value="1">1</option>
-                            <option value="5">5</option>
-                            <option value="10" selected>10</option>
-                            <option value="15">15</option>
-                            <option value="20">20</option>
-                            <option value="25">25</option>
-                        </select>
-                    </div>
-                    <div class="flex flex-col flex-1">
-                        <label for="css-selector" class="text-lime-500 font-bold text-xs">CSS Selector</label>
-                        <input
-                            type="text"
-                            id="css-selector"
-                            class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300 placeholder-lime-700"
-                            placeholder="CSS Selector (e.g. .content, #main, article)"
-                        />
-                    </div>
-                </div>
-                <div class="flex gap-2">
-                    <div class="flex flex-col">
-                        <label for="extraction-strategy-select" class="text-lime-500 font-bold text-xs"
-                            >Extraction Strategy</label
-                        >
-                        <select
-                            id="extraction-strategy-select"
-                            class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
-                        >
-                            <option value="NoExtractionStrategy" selected>NoExtractionStrategy</option>
-                            <option value="CosineStrategy">CosineStrategy</option>
-                            <option value="LLMExtractionStrategy">LLMExtractionStrategy</option>
-                        </select>
-                    </div>
-                    <div class="flex flex-col">
-                        <label for="chunking-strategy-select" class="text-lime-500 font-bold text-xs"
-                            >Chunking Strategy</label
-                        >
-                        <select
-                            id="chunking-strategy-select"
-                            class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
-                        >
-                            <option value="RegexChunking">RegexChunking</option>
-                            <option value="NlpSentenceChunking">NlpSentenceChunking</option>
-                            <option value="TopicSegmentationChunking">TopicSegmentationChunking</option>
-                            <option value="FixedLengthWordChunking">FixedLengthWordChunking</option>
-                            <option value="SlidingWindowChunking">SlidingWindowChunking</option>
-                        </select>
-                    </div>
-                </div>
-                <div id = "llm_settings" class="flex gap-2 hidden hidden">
-                    <div class="flex flex-col">
-                        <label for="provider-model-select" class="text-lime-500 font-bold text-xs"
-                            >Provider Model</label
-                        >
-                        <select
-                            id="provider-model-select"
-                            class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
-                        >
-                            <option value="groq/llama3-70b-8192">groq/llama3-70b-8192</option>
-                            <option value="groq/llama3-8b-8192">groq/llama3-8b-8192</option>
-                            <option value="groq/mixtral-8x7b-32768">groq/mixtral-8x7b-32768</option>
-                            <option value="openai/gpt-4-turbo">gpt-4-turbo</option>
-                            <option value="openai/gpt-3.5-turbo">gpt-3.5-turbo</option>
-                            <option value="openai/gpt-4o">gpt-4o</option>
-                            <option value="anthropic/claude-3-haiku-20240307">claude-3-haiku</option>
-                            <option value="anthropic/claude-3-opus-20240229">claude-3-opus</option>
-                            <option value="anthropic/claude-3-sonnet-20240229">claude-3-sonnet</option>
-                        </select>
-                    </div>
-                    <div class="flex flex-col flex-1">
-                        <label for="token-input" class="text-lime-500 font-bold text-xs">API Token</label>
-                        <input
-                            type="password"
-                            id="token-input"
-                            class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300"
-                            placeholder="Enter Groq API token"
-                        />
-                    </div>
-                </div>
-                <div  class="flex gap-2">
-                    <!-- Add two textarea one for getting Keyword Filter and another one Instruction, make both grow whole with-->
-                    <div id = "semantic_filter_div" class="flex flex-col flex-1 hidden">
-                        <label for="semantic_filter" class="text-lime-500 font-bold text-xs">Keyword Filter</label>
-                        <textarea
-                            id="semantic_filter"
-                            rows="3"
-                            class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300 placeholder-zinc-700"
-                            placeholder="Enter keywords for CosineStrategy to narrow down the content."
-                        ></textarea>
-                    </div>
-                    <div id = "instruction_div" class="flex flex-col flex-1 hidden">
-                        <label for="instruction" class="text-lime-500 font-bold text-xs">Instruction</label>
-                        <textarea
-                            id="instruction"
-                            rows="3"
-                            class="border border-zinc-700 rounded px-4 py-0 bg-zinc-900 text-zinc-300 placeholder-zinc-700"
-                            placeholder="Enter instruction for the LLMExtractionStrategy to instruct the model."
-                        ></textarea>
-                    </div>
-                </div>
-                <div class="flex gap-3">
-                    <div class="flex items-center gap-2">
-                        <input type="checkbox" id="bypass-cache-checkbox" />
-                        <label for="bypass-cache-checkbox" class="text-lime-500 font-bold">Bypass Cache</label>
-                    </div>
-                    <div class="flex items-center gap-2">
-                        <input type="checkbox" id="screenshot-checkbox" checked />
-                        <label for="screenshot-checkbox" class="text-lime-500 font-bold">Screenshot</label>
-                    </div>
-                    <div class="flex items-center gap-2 hidden">
-                        <input type="checkbox" id="extract-blocks-checkbox" />
-                        <label for="extract-blocks-checkbox" class="text-lime-500 font-bold">Extract Blocks</label>
-                    </div>
-                    <button id="crawl-btn" class="bg-lime-600 text-black font-bold px-4 py-0 rounded">Crawl</button>
-                </div>
-            </div>
-
-            <div id="loading" class="hidden">
-                <p class="text-white">Loading... Please wait.</p>
-            </div>
-            <div id="result" class="flex-1  overflow-x-auto">
-                <div class="tab-buttons flex gap-2">
-                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
-                        JSON
-                    </button>
-                    <button
-                        class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                        data-tab="cleaned-html"
-                    >
-                        Cleaned HTML
-                    </button>
-                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="markdown">
-                        Markdown
-                    </button>
-                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="media">
-                        Media
-                    </button>
-                    <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="screenshot">
-                        Screenshot
-                    </button>
-                </div>
-                <div class="tab-content code bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
-                    <pre class="h-full flex"><code id="json-result" class="language-json"></code></pre>
-                    <pre class="hidden h-full flex"><code id="cleaned-html-result" class="language-html"></code></pre>
-                    <pre class="hidden h-full flex"><code id="markdown-result" class="language-markdown"></code></pre>
-                    <pre class="hidden h-full flex"><code id="media-result" class="language-json"></code></pre>
-                    <pre class="hidden h-full flex"><code id="screenshot-result"></code></pre>
-                </div>
-            </div>
-
-            <div id="code_help" class="flex-1  overflow-x-auto">
-                <div class="tab-buttons flex gap-2">
-                    <button class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="curl">
-                        cURL
-                    </button>
-                    <button
-                        class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                        data-tab="library"
-                    >
-                        Python
-                    </button>
-                    <button
-                        class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                        data-tab="python"
-                    >
-                        REST API
-                    </button>
-                    <!-- <button
-                        class="code-tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500"
-                        data-tab="nodejs"
-                    >
-                        Node.js
-                    </button> -->
-                </div>
-                <div class="tab-content result bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
-                    <pre class="h-full flex relative overflow-x-auto">
-                        <code id="curl-code" class="language-bash"></code>
-                        <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
-                    </pre>
-                    <pre class="hidden h-full flex relative overflow-x-auto">
-                        <code id="python-code" class="language-python"></code>
-                        <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
-                    </pre>
-                    <pre class="hidden h-full flex relative overflow-x-auto">
-                        <code id="nodejs-code" class="language-javascript"></code>
-                        <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
-                    </pre>
-                    <pre class="hidden h-full flex relative overflow-x-auto">
-                        <code id="library-code" class="language-python"></code>
-                        <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="library-code">Copy</button>
-                    </pre>
-                </div>
-            </div>
-        </div>
-    </div>
-</section>
--- a/pages/tmp.html
+++ b/pages/tmp.html
@@ -1,434 +0,0 @@
-<div class="w-3/4 p-4">
-    <section id="installation" class="content-section active">
-        <h1 class="text-2xl font-bold">Installation 💻</h1>
-        <p class="mb-4">There are three ways to use Crawl4AI:</p>
-        <ol class="list-decimal list-inside mb-4">
-            <li class="">As a library</li>
-            <li class="">As a local server (Docker)</li>
-            <li class="">
-                As a Google Colab notebook.
-                <a href="https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk"
-                    ><img
-                        src="https://colab.research.google.com/assets/colab-badge.svg"
-                        alt="Open In Colab"
-                        style="display: inline-block; width: 100px; height: 20px"
-                /></a>
-            </li>
-            <p></p>
-
-            <p class="my-4">To install Crawl4AI as a library, follow these steps:</p>
-
-            <ol class="list-decimal list-inside mb-4">
-                <li class="mb-4">
-                    Install the package from GitHub:
-                    <pre
-                        class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-                    ><code class="hljs language-bash">pip install git+https://github.com/unclecode/crawl4ai.git</code></pre>
-                </li>
-                <li class="mb-4">
-                    Alternatively, you can clone the repository and install the package locally:
-                    <pre
-                        class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-                    ><code class="language-python bash hljs">virtualenv venv
-source venv/<span class="hljs-built_in">bin</span>/activate
-git clone https://github.com/unclecode/crawl4ai.git
-cd crawl4ai
-pip install -e .
-</code></pre>
-                </li>
-                <li class="">
-                    Use docker to run the local server:
-                    <pre
-                        class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
-                    ><code class="language-python bash hljs">docker build -t crawl4ai . 
-<span class="hljs-comment"># docker build --platform linux/amd64 -t crawl4ai . For Mac users</span>
-docker run -d -p <span class="hljs-number">8000</span>:<span class="hljs-number">80</span> crawl4ai</code></pre>
-                </li>
-            </ol>
-            <p class="mb-4">
-                For more information about how to run Crawl4AI as a local server, please refer to the
-                <a href="https://github.com/unclecode/crawl4ai" class="text-blue-400">GitHub repository</a>.
-            </p>
-        </ol>
-    </section>
-    <section id="how-to-guide" class="content-section">
-        <h1 class="text-2xl font-bold">How to Guide</h1>
-        <div class="flex flex-col gap-4 p-4 bg-zinc-900 text-lime-500">
-            <!-- Step 1 -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                🌟
-                <strong>Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun!</strong>
-            </div>
-            <div class="">
-                First Step: Create an instance of WebCrawler and call the
-                <code>warmup()</code> function.
-            </div>
-            <div>
-                <pre><code class="language-python hljs">crawler = WebCrawler()
-crawler.warmup()</code></pre>
-            </div>
-
-            <!-- Step 2 -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                🧠 <strong>Understanding 'bypass_cache' and 'include_raw_html' parameters:</strong>
-            </div>
-            <div class="">First crawl (caches the result):</div>
-            <div>
-                <pre><code class="language-python hljs">result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>)</code></pre>
-            </div>
-            <div class="">Second crawl (Force to crawl again):</div>
-            <div>
-                <pre><code class="language-python hljs">result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>, bypass_cache=<span class="hljs-literal">True</span>)</code></pre>
-                <div class="bg-red-900 p-2 text-zinc-50">
-                    ⚠️ Don't forget to set <code>`bypass_cache`</code> to True if you want to try different strategies
-                    for the same URL. Otherwise, the cached result will be returned. You can also set
-                    <code>`always_by_pass_cache`</code> in constructor to True to always bypass the cache.
-                </div>
-            </div>
-            <div class="">Crawl result without raw HTML content:</div>
-            <div>
-                <pre><code class="language-python hljs">result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>, include_raw_html=<span class="hljs-literal">False</span>)</code></pre>
-            </div>
-
-            <!-- Step 3 -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                📄
-                <strong
-                    >The 'include_raw_html' parameter, when set to True, includes the raw HTML content in the response.
-                    By default, it is set to True.</strong
-                >
-            </div>
-            <div class="">Set <code>always_by_pass_cache</code> to True:</div>
-            <div>
-                <pre><code class="language-python hljs">crawler.always_by_pass_cache = <span class="hljs-literal">True</span></code></pre>
-            </div>
-
-            <!-- Step 4 -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                🧩 <strong>Let's add a chunking strategy: RegexChunking!</strong>
-            </div>
-            <div class="">Using RegexChunking:</div>
-            <div>
-                <pre><code class="language-python hljs">result = crawler.run(
-url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
-chunking_strategy=RegexChunking(patterns=[<span class="hljs-string">"\n\n"</span>])
-)</code></pre>
-            </div>
-            <div class="">Using NlpSentenceChunking:</div>
-            <div>
-                <pre><code class="language-python hljs">result = crawler.run(
-url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
-chunking_strategy=NlpSentenceChunking()
-)</code></pre>
-            </div>
-
-            <!-- Step 5 -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                🧠 <strong>Let's get smarter with an extraction strategy: CosineStrategy!</strong>
-            </div>
-            <div class="">Using CosineStrategy:</div>
-            <div>
-                <pre><code class="language-python hljs">result = crawler.run(
-url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
-extraction_strategy=CosineStrategy(word_count_threshold=<span class="hljs-number">20</span>, max_dist=<span class="hljs-number">0.2</span>, linkage_method=<span class="hljs-string">"ward"</span>, top_k=<span class="hljs-number">3</span>)
-)</code></pre>
-            </div>
-
-            <!-- Step 6 -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                🤖
-                <strong>Time to bring in the big guns: LLMExtractionStrategy without instructions!</strong>
-            </div>
-            <div class="">Using LLMExtractionStrategy without instructions:</div>
-            <div>
-                <pre><code class="language-python hljs">result = crawler.run(
-url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
-extraction_strategy=LLMExtractionStrategy(provider=<span class="hljs-string">"openai/gpt-4o"</span>, api_token=os.getenv(<span class="hljs-string">'OPENAI_API_KEY'</span>))
-)</code></pre>
-            </div>
-
-            <!-- Step 7 -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                📜
-                <strong>Let's make it even more interesting: LLMExtractionStrategy with instructions!</strong>
-            </div>
-            <div class="">Using LLMExtractionStrategy with instructions:</div>
-            <div>
-                <pre><code class="language-python hljs">result = crawler.run(
-url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
-extraction_strategy=LLMExtractionStrategy(
-provider=<span class="hljs-string">"openai/gpt-4o"</span>,
-api_token=os.getenv(<span class="hljs-string">'OPENAI_API_KEY'</span>),
-instruction=<span class="hljs-string">"I am interested in only financial news"</span>
-)
-)</code></pre>
-            </div>
-
-            <!-- Step 8 -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                🎯
-                <strong>Targeted extraction: Let's use a CSS selector to extract only H2 tags!</strong>
-            </div>
-            <div class="">Using CSS selector to extract H2 tags:</div>
-            <div>
-                <pre><code class="language-python hljs">result = crawler.run(
-url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>,
-css_selector=<span class="hljs-string">"h2"</span>
-)</code></pre>
-            </div>
-
-            <!-- Step 9 -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                🖱️
-                <strong>Let's get interactive: Passing JavaScript code to click 'Load More' button!</strong>
-            </div>
-            <div class="">Using JavaScript to click 'Load More' button:</div>
-            <div>
-                <pre><code class="language-python hljs">js_code = <span class="hljs-string">"""
-const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button =&gt; button.textContent.includes('Load More'));
-loadMoreButton &amp;&amp; loadMoreButton.click();
-"""</span>
-crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
-crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=<span class="hljs-literal">True</span>)
-result = crawler.run(url=<span class="hljs-string">"https://www.nbcnews.com/business"</span>)</code></pre>
-            </div>
-
-            <!-- Conclusion -->
-            <div class="col-span-2 bg-lime-800 p-2 rounded text-zinc-50">
-                🎉
-                <strong
-                    >Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the
-                    web like a pro! 🕸️</strong
-                >
-            </div>
-        </div>
-    </section>
-
-    <section id="chunking-strategies" class="content-section">
-        <div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
-            <div class="text-gray-300 prose prose-sm">
-                <h3>RegexChunking</h3>
-                <p>
-                    <code>RegexChunking</code> is a text chunking strategy that splits a given text into smaller parts
-                    using regular expressions. This is useful for preparing large texts for processing by language
-                    models, ensuring they are divided into manageable segments.
-                </p>
-                <h4>Constructor Parameters:</h4>
-                <ul>
-                    <li>
-                        <code>patterns</code> (list, optional): A list of regular expression patterns used to split the
-                        text. Default is to split by double newlines (<code>['\n\n']</code>).
-                    </li>
-                </ul>
-                <h4>Example usage:</h4>
-                <pre><code class="language-python">chunker = RegexChunking(patterns=[r'\n\n', r'\. '])
-chunks = chunker.chunk("This is a sample text. It will be split into chunks.")
-</code></pre>
-            </div>
-        </div>
-        <div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
-            <div class="text-gray-300 prose prose-sm">
-                <h3>NlpSentenceChunking</h3>
-                <p>
-                    <code>NlpSentenceChunking</code> uses a natural language processing model to chunk a given text into
-                    sentences. This approach leverages SpaCy to accurately split text based on sentence boundaries.
-                </p>
-                <h4>Constructor Parameters:</h4>
-                <ul>
-                    <li>
-                        None.
-                    </li>
-                </ul>
-                <h4>Example usage:</h4>
-                <pre><code class="language-python">chunker = NlpSentenceChunking()
-chunks = chunker.chunk("This is a sample text. It will be split into sentences.")
-</code></pre>
-            </div>
-        </div>
-        <div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
-            <div class="text-gray-300 prose prose-sm">
-                <h3>TopicSegmentationChunking</h3>
-                <p>
-                    <code>TopicSegmentationChunking</code> uses the TextTiling algorithm to segment a given text into
-                    topic-based chunks. This method identifies thematic boundaries in the text.
-                </p>
-                <h4>Constructor Parameters:</h4>
-                <ul>
-                    <li>
-                        <code>num_keywords</code> (int, optional): The number of keywords to extract for each topic
-                        segment. Default is <code>3</code>.
-                    </li>
-                </ul>
-                <h4>Example usage:</h4>
-                <pre><code class="language-python">chunker = TopicSegmentationChunking(num_keywords=3)
-chunks = chunker.chunk("This is a sample text. It will be split into topic-based segments.")
-</code></pre>
-            </div>
-        </div>
-        <div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
-            <div class="text-gray-300 prose prose-sm">
-                <h3>FixedLengthWordChunking</h3>
-                <p>
-                    <code>FixedLengthWordChunking</code> splits a given text into chunks of fixed length, based on the
-                    number of words.
-                </p>
-                <h4>Constructor Parameters:</h4>
-                <ul>
-                    <li>
-                        <code>chunk_size</code> (int, optional): The number of words in each chunk. Default is
-                        <code>100</code>.
-                    </li>
-                </ul>
-                <h4>Example usage:</h4>
-                <pre><code class="language-python">chunker = FixedLengthWordChunking(chunk_size=100)
-chunks = chunker.chunk("This is a sample text. It will be split into fixed-length word chunks.")
-</code></pre>
-            </div>
-        </div>
-        <div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
-            <div class="text-gray-300 prose prose-sm">
-                <h3>SlidingWindowChunking</h3>
-                <p>
-                    <code>SlidingWindowChunking</code> uses a sliding window approach to chunk a given text. Each chunk
-                    has a fixed length, and the window slides by a specified step size.
-                </p>
-                <h4>Constructor Parameters:</h4>
-                <ul>
-                    <li>
-                        <code>window_size</code> (int, optional): The number of words in each chunk. Default is
-                        <code>100</code>.
-                    </li>
-                    <li>
-                        <code>step</code> (int, optional): The number of words to slide the window. Default is
-                        <code>50</code>.
-                    </li>
-                </ul>
-                <h4>Example usage:</h4>
-                <pre><code class="language-python">chunker = SlidingWindowChunking(window_size=100, step=50)
-chunks = chunker.chunk("This is a sample text. It will be split using a sliding window approach.")
-</code></pre>
-            </div>
-        </div>
-    </section>
-    <section id="extraction-strategies" class="content-section">
-        <div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
-            <div class="text-gray-300 prose prose-sm">
-                <h3>NoExtractionStrategy</h3>
-                <p>
-                    <code>NoExtractionStrategy</code> is a basic extraction strategy that returns the entire HTML
-                    content without any modification. It is useful for cases where no specific extraction is required.
-                    Only clean HTML and markdown.
-                </p>
-                <h4>Constructor Parameters:</h4>
-                <p>None.</p>
-                <h4>Example usage:</h4>
-                <pre><code class="language-python">extractor = NoExtractionStrategy()
-extracted_content = extractor.extract(url, html)
-</code></pre>
-            </div>
-        </div>
-        <div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
-            <div class="text-gray-300 prose prose-sm">
-                <h3>LLMExtractionStrategy</h3>
-                <p>
-                    <code>LLMExtractionStrategy</code> uses a Language Model (LLM) to extract meaningful blocks or
-                    chunks from the given HTML content. This strategy leverages an external provider for language model
-                    completions.
-                </p>
-                <h4>Constructor Parameters:</h4>
-                <ul>
-                    <li>
-                        <code>provider</code> (str, optional): The provider to use for the language model completions.
-                        Default is <code>DEFAULT_PROVIDER</code> (e.g., openai/gpt-4).
-                    </li>
-                    <li>
-                        <code>api_token</code> (str, optional): The API token for the provider. If not provided, it will
-                        try to load from the environment variable <code>OPENAI_API_KEY</code>.
-                    </li>
-                    <li>
-                        <code>instruction</code> (str, optional): An instruction to guide the LLM on how to perform the
-                        extraction. This allows users to specify the type of data they are interested in or set the tone
-                        of the response. Default is <code>None</code>.
-                    </li>
-                </ul>
-                <h4>Example usage:</h4>
-                <pre><code class="language-python">extractor = LLMExtractionStrategy(provider='openai', api_token='your_api_token', instruction='Extract only news about AI.')
-extracted_content = extractor.extract(url, html)
-</code></pre>
-                <p>
-                    By providing clear instructions, users can tailor the extraction process to their specific needs,
-                    enhancing the relevance and utility of the extracted content.
-                </p>
-            </div>
-        </div>
-        <div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
-            <div class="text-gray-300 prose prose-sm">
-                <h3>CosineStrategy</h3>
-                <p>
-                    <code>CosineStrategy</code> uses hierarchical clustering based on cosine similarity to extract
-                    clusters of text from the given HTML content. This strategy is suitable for identifying related
-                    content sections.
-                </p>
-                <h4>Constructor Parameters:</h4>
-                <ul>
-                    <li>
-                        <code>semantic_filter</code> (str, optional): A string containing keywords for filtering relevant
-                        documents before clustering. If provided, documents are filtered based on their cosine
-                        similarity to the keyword filter embedding. Default is <code>None</code>.
-                    </li>
-                    <li>
-                        <code>word_count_threshold</code> (int, optional): Minimum number of words per cluster. Default
-                        is <code>20</code>.
-                    </li>
-                    <li>
-                        <code>max_dist</code> (float, optional): The maximum cophenetic distance on the dendrogram to
-                        form clusters. Default is <code>0.2</code>.
-                    </li>
-                    <li>
-                        <code>linkage_method</code> (str, optional): The linkage method for hierarchical clustering.
-                        Default is <code>'ward'</code>.
-                    </li>
-                    <li>
-                        <code>top_k</code> (int, optional): Number of top categories to extract. Default is
-                        <code>3</code>.
-                    </li>
-                    <li>
-                        <code>model_name</code> (str, optional): The model name for embedding generation. Default is
-                        <code>'BAAI/bge-small-en-v1.5'</code>.
-                    </li>
-                </ul>
-                <h4>Example usage:</h4>
-                <pre><code class="language-python">extractor = CosineStrategy(semantic_filter='artificial intelligence', word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='BAAI/bge-small-en-v1.5')
-extracted_content = extractor.extract(url, html)
-</code></pre>
-                <h4>Cosine Similarity Filtering</h4>
-                <p>
-                    When a <code>semantic_filter</code> is provided, the <code>CosineStrategy</code> applies an
-                    embedding-based filtering process to select relevant documents before performing hierarchical
-                    clustering.
-                </p>
-            </div>
-        </div>
-        <div class="bg-zinc-800 p-4 rounded shadow-md docs-item">
-            <div class="text-gray-300 prose prose-sm">
-                <h3>TopicExtractionStrategy</h3>
-                <p>
-                    <code>TopicExtractionStrategy</code> uses the TextTiling algorithm to segment the HTML content into
-                    topics and extracts keywords for each segment. This strategy is useful for identifying and
-                    summarizing thematic content.
-                </p>
-                <h4>Constructor Parameters:</h4>
-                <ul>
-                    <li>
-                        <code>num_keywords</code> (int, optional): Number of keywords to represent each topic segment.
-                        Default is <code>3</code>.
-                    </li>
-                </ul>
-                <h4>Example usage:</h4>
-                <pre><code class="language-python">extractor = TopicExtractionStrategy(num_keywords=3)
-extracted_content = extractor.extract(url, html)
-</code></pre>
-            </div>
-        </div>
-    </section>
-</div>
Author	SHA1	Message	Date
unclecode	de43505ae4	feat: update version to 0.3.742	2024-11-24 19:36:30 +08:00
unclecode	d7c5b900b8	feat: add support for arm64 platform in Docker commands and update INSTALL_TYPE variable in docker-compose	2024-11-24 19:35:53 +08:00
unclecode	edad7b6a74	chore: remove Railway deployment configuration and related documentation	2024-11-24 18:48:39 +08:00
UncleCode	829a1f7992	feat: update version to 0.3.741 and enhance content filtering with heuristic strategy. Fixing the issue that when the past HTML to BM25 content filter does not have any HTML elements.	2024-11-23 19:45:41 +08:00
UncleCode	d729aa7d5e	refactor: Add group ID to for images extracted from srcset.	2024-11-23 18:00:32 +08:00
UncleCode	0d0cef3438	feat: add enhanced markdown generation example with citations and file output	2024-11-22 20:14:58 +08:00
UncleCode	d7a112fefe	Merge branch 'main' of https://github.com/unclecode/crawl4ai	2024-11-22 19:56:56 +08:00
UncleCode	a5decaa7cf	Merge branch '0.3.74'	2024-11-22 19:55:52 +08:00
程序员阿江(Relakkes)	3439f7886d	fix: crawler strategy exception handling and fixes (#271 )	2024-11-20 20:30:25 +08:00
Darwing Medina	d418a04602	Fix #260 prevent pass duplicated kwargs to scrapping_strategy (#269 ) Thank you for the suggestions. It totally makes sense now. Change to pop operator.	2024-11-20 18:52:11 +08:00
UncleCode	b654c49e55	Update .gitignore to exclude additional scripts and files	2024-11-19 19:32:06 +08:00
UncleCode	fbcff85ecb	Remove test files	2024-11-19 19:03:23 +08:00
UncleCode	788c67c29a	Merge branch 'main' of https://github.com/unclecode/crawl4ai	2024-11-19 19:02:44 +08:00
UncleCode	2f19d38693	Update .gitignore to include .gitboss/ and todo_executor.md	2024-11-19 19:02:41 +08:00
ntohidikplay	3aae30ed2a	test1: trying to push to main	2024-11-19 11:57:07 +01:00
ntohidikplay	593c7ad307	test: trying to push to main	2024-11-19 11:45:26 +01:00