Compare commits


44 Commits

Author SHA1 Message Date
Unclecode  b1ac4fe023  Merge branch 'main' into ssh-server  2024-12-12 12:25:26 +00:00
Unclecode  a3c92141a1  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-12-12 12:25:01 +00:00
Unclecode  3fd777dd6f  remove crawl endpoints  2024-12-12 12:24:13 +00:00
Unclecode  d7200138a0  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-12-08 12:06:53 +00:00
Unclecode  be37abe05a  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-12-04 12:31:45 +00:00
Unclecode  90ba51b52f  fix(mkdocs): correct typo in Docker Deployment navigation entry  2024-12-04 12:31:41 +00:00
Unclecode  11721eb0ce  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-11-05 13:02:59 +00:00
Unclecode  1222e456fb  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-11-05 12:58:30 +00:00
Unclecode  e8aaa57cb2  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-10-30 12:59:34 +00:00
Unclecode  a661b3173d  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-10-30 12:47:07 +00:00
Unclecode  b781b6df96  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-10-27 11:42:23 +00:00
Unclecode  14e537fdd3  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-08-04 06:57:16 +00:00
Unclecode  64b33af0e0  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-08-02 08:04:54 +00:00
Unclecode  1afcdb6996  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-07-08 12:24:13 +00:00
Unclecode  ca625b3152  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-07-08 12:02:19 +00:00
Unclecode  6521b4745f  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-07-08 08:35:49 +00:00
Unclecode  241862bfe6  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-07-03 07:27:37 +00:00
Unclecode  f2491b6c1a  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-29 16:34:15 +00:00
Unclecode  886622cb1e  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-29 16:23:44 +00:00
Unclecode  13dc254438  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-26 07:35:06 +00:00
Unclecode  096929153f  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-26 05:45:25 +00:00
Unclecode  7e95c38acb  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-24 14:40:48 +00:00
Unclecode  c697bf23e4  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-22 16:37:27 +00:00
Unclecode  b951d34ed0  chore: Update fetch URL to use HTTPS  2024-06-22 16:37:21 +00:00
Unclecode  c8a10dc455  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-22 12:54:41 +00:00
Unclecode  9e0ded8da0  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-22 12:41:52 +00:00
Unclecode  48c27899b7  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-22 12:38:14 +00:00
Unclecode  3c32b0abed  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-21 09:58:17 +00:00
Unclecode  a215ec08d6  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-19 10:51:31 +00:00
Unclecode  5d3fef45f7  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-18 12:02:29 +00:00
Unclecode  77df6db453  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-08 10:38:10 +00:00
Unclecode  2124652327  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-08 10:07:30 +00:00
Unclecode  255bde70c9  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-08 08:53:54 +00:00
Unclecode  04808b5dc9  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 12:44:41 +00:00
Unclecode  b3a150f3d1  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:26:43 +00:00
Unclecode  de80a2da09  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:25:49 +00:00
Unclecode  df4cda8322  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:24:46 +00:00
Unclecode  7717a3b948  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:19:37 +00:00
Unclecode  a4a6b2075f  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-07 08:18:19 +00:00
Unclecode  4010558885  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-02 08:12:32 +00:00
Unclecode  b0cf5076da  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-02 08:09:25 +00:00
Unclecode  0d6e9e37ca  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-02 08:06:56 +00:00
Unclecode  9b0f71ba88  Merge branch 'main' of https://github.com/unclecode/crawl4ai  2024-06-02 07:56:00 +00:00
Unclecode  6ddccc144c  chore: Bump version to 0.2.2 in setup.py  2024-05-19 16:19:40 +00:00
15 changed files with 398 additions and 167 deletions

.do/deploy.template.yaml (new file, +22)

@@ -0,0 +1,22 @@
spec:
  name: crawl4ai
  services:
    - name: crawl4ai
      git:
        branch: 0.3.74
        repo_clone_url: https://github.com/unclecode/crawl4ai.git
      dockerfile_path: Dockerfile
      http_port: 11235
      instance_count: 1
      instance_size_slug: professional-xs
      health_check:
        http_path: /health
      envs:
        - key: INSTALL_TYPE
          value: "basic"
        - key: PYTHON_VERSION
          value: "3.10"
        - key: ENABLE_GPU
          value: "false"
      routes:
        - path: /
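The spec above builds the repository's Dockerfile from the 0.3.74 branch, serves the container on port 11235, and health-checks it via GET /health. A minimal sketch of probing a deployed instance, assuming only what the spec declares (the hostname is a placeholder; App Platform assigns one per app):

```python
# Hedged sketch: hit the /health route the spec's health_check points at.
# The hostname below is an assumption, not a real deployment.
import requests

resp = requests.get("https://crawl4ai-demo.ondigitalocean.app/health", timeout=10)
resp.raise_for_status()
print(resp.json())  # the service's health payload
```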

MANIFEST.in

@@ -1,2 +1 @@
include requirements.txt
recursive-include crawl4ai/js_snippet *.js
include requirements.txt

README.md

@@ -1,4 +1,4 @@
# 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.
# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

crawl4ai/_version.py

@@ -1,2 +1,2 @@
# crawl4ai/_version.py
__version__ = "0.4.22"
__version__ = "0.4.2"

crawl4ai/async_configs.py

@@ -7,7 +7,6 @@ from .config import (
from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy
from .markdown_generation_strategy import MarkdownGenerationStrategy
class BrowserConfig:
"""
@@ -270,7 +269,6 @@ class CrawlerRunConfig:
word_count_threshold: int = MIN_WORD_THRESHOLD ,
extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None
chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None
markdown_generator : MarkdownGenerationStrategy = None,
content_filter=None,
cache_mode=None,
session_id: str = None,
@@ -311,7 +309,6 @@ class CrawlerRunConfig:
self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator
self.content_filter = content_filter
self.cache_mode = cache_mode
self.session_id = session_id
@@ -367,7 +364,6 @@ class CrawlerRunConfig:
word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy"),
markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"),
cache_mode=kwargs.get("cache_mode"),
session_id=kwargs.get("session_id"),
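The two classes touched here split the older flat keyword arguments in two: BrowserConfig carries browser-level options, while CrawlerRunConfig carries per-crawl options such as word_count_threshold, cache_mode, and session_id. A minimal usage sketch, assuming the crawl4ai 0.4.x top-level exports:

```python
# Hedged sketch of driving a crawl through the config objects in this diff;
# assumes AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, and CacheMode are
# importable from the crawl4ai package root, as in 0.4.x.
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def main():
    browser_cfg = BrowserConfig(headless=True)   # browser-level options
    run_cfg = CrawlerRunConfig(
        word_count_threshold=10,                 # skip near-empty text blocks
        cache_mode=CacheMode.BYPASS,             # always re-fetch
        session_id="demo",                       # reuse one page across runs
    )
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun("https://example.com", config=run_cfg)
        print(result.success)

asyncio.run(main())
```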

crawl4ai/async_webcrawler.py

@@ -7,8 +7,7 @@ from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from contextlib import nullcontext, asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
@@ -16,7 +15,6 @@ from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -134,12 +132,17 @@ class AsyncWebCrawler:
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
@asynccontextmanager
async def nullcontext(self):
yield
async def awarmup(self):
"""Initialize the crawler with warm-up sequence."""
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
self.ready = True
@asynccontextmanager
async def nullcontext(self):
"""异步空上下文管理器"""
@@ -320,8 +323,7 @@ class AsyncWebCrawler:
config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data,
pdf_data=pdf_data,
verbose=config.verbose,
**kwargs
verbose=config.verbose
)
# Set response data
@@ -422,8 +424,7 @@ class AsyncWebCrawler:
css_selector=config.css_selector,
only_text=config.only_text,
image_description_min_word_threshold=config.image_description_min_word_threshold,
content_filter=config.content_filter,
**kwargs
content_filter=config.content_filter
)
if result is None:
@@ -434,29 +435,16 @@ class AsyncWebCrawler:
except Exception as e:
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
# Extract results
markdown_v2 = result.get("markdown_v2", None)
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
fit_html = sanitize_input_encode(result.get("fit_html", ""))
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})
# Markdown Generation
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
if not config.content_filter and not markdown_generator.content_filter:
markdown_generator.content_filter = PruningContentFilter()
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
# html2text_options=kwargs.get('html2text', {})
)
markdown_v2 = markdown_result
markdown = sanitize_input_encode(markdown_result.raw_markdown)
# Log processing completion
self.logger.info(
message="Processed {url:.50}... | Time: {timing}ms",

crawl4ai/content_scraping_strategy.py

@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')
# markdown_content = self._generate_markdown_content(
# cleaned_html=cleaned_html,
# html=html,
# url=url,
# success=success,
# **kwargs
# )
markdown_content = self._generate_markdown_content(
cleaned_html=cleaned_html,
html=html,
url=url,
success=success,
**kwargs
)
return {
# **markdown_content,
**markdown_content,
'cleaned_html': cleaned_html,
'success': success,
'media': media,

(docs example; file path not shown)

@@ -1,40 +1,41 @@
import os
import time
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
import asyncio
from pydantic import BaseModel, Field
url = r'https://openai.com/api/pricing/'
crawler = WebCrawler()
crawler.warmup()
from pydantic import BaseModel, Field
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
from crawl4ai import AsyncWebCrawler
result = crawler.run(
url=url,
word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy(
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="From the crawled content, extract all mentioned model names along with their "\
"fees for input and output tokens. Make sure not to miss anything in the entire content. "\
'One extracted model JSON format should look like this: '\
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
),
bypass_cache=True,
)
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=url,
word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy(
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="From the crawled content, extract all mentioned model names along with their " \
"fees for input and output tokens. Make sure not to miss anything in the entire content. " \
'One extracted model JSON format should look like this: ' \
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
),
model_fees = json.loads(result.extracted_content)
)
print("Success:", result.success)
model_fees = json.loads(result.extracted_content)
print(len(model_fees))
print(len(model_fees))
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)
asyncio.run(main())
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)
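Assembled from the added lines above, the migrated async example reads as one piece roughly as follows (a sketch; the json import, implicit in the diff, is made explicit):

```python
import os
import json
import asyncio
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

url = r'https://openai.com/api/pricing/'

class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="groq/llama-3.1-70b-versatile",
                api_token=os.getenv('GROQ_API_KEY'),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names "
                            "along with their fees for input and output tokens.",
            ),
        )
        print("Success:", result.success)
        model_fees = json.loads(result.extracted_content)
        print(len(model_fees))
        with open(".data/data.json", "w", encoding="utf-8") as f:
            f.write(result.extracted_content)

asyncio.run(main())
```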

(docs example; file path not shown)

@@ -142,7 +142,6 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=1,
page_timeout = 80000,
extraction_strategy=LLMExtractionStrategy(
provider=provider,
api_token=api_token,
@@ -498,21 +497,21 @@ async def main():
# Advanced examples
# await extract_structured_data_using_css_extractor()
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
# Browser comparisons
# await crawl_custom_browser_type()
await crawl_custom_browser_type()
# Performance testing
# await speed_comparison()
# Screenshot example
# await capture_and_save_screenshot(
# "https://www.example.com",
# os.path.join(__location__, "tmp/example_screenshot.jpg")
# )
await capture_and_save_screenshot(
"https://www.example.com",
os.path.join(__location__, "tmp/example_screenshot.jpg")
)
if __name__ == "__main__":
asyncio.run(main())

(docs example; file path not shown)

@@ -239,10 +239,8 @@ async def crawl_dynamic_content_pages_method_1():
all_commits = []
js_next_page = """
(() => {
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
})();
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""
for page in range(3): # Crawl 3 pages
@@ -606,14 +604,14 @@ async def fit_markdown_remove_overlay():
async def main():
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await simple_crawl()
# await simple_example_with_running_js_code()
# await simple_example_with_css_selector()
# # await use_proxy()
# await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
# await extract_structured_data_using_css_extractor()
await simple_crawl()
await simple_example_with_running_js_code()
await simple_example_with_css_selector()
# await use_proxy()
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
await extract_structured_data_using_css_extractor()
# LLM extraction examples
# await extract_structured_data_using_llm()
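The js_next_page change above drops the IIFE wrapper, since the snippet is injected into the page and executed as-is. A hedged sketch of how such a snippet is passed through a crawl, assuming CrawlerRunConfig's js_code, js_only, and session_id parameters from the 0.4.x API (the target URL is illustrative):

```python
# Sketch: re-run JS in the same live page across iterations. session_id keeps
# the page open; js_only skips re-navigation on later rounds.
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

JS_NEXT_PAGE = """
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""

async def main():
    async with AsyncWebCrawler() as crawler:
        for page in range(3):
            cfg = CrawlerRunConfig(
                session_id="commits",
                js_code=JS_NEXT_PAGE if page > 0 else None,
                js_only=page > 0,
                cache_mode=CacheMode.BYPASS,
            )
            result = await crawler.arun("https://example.com/commits", config=cfg)
            print(f"page {page}: success={result.success}")

asyncio.run(main())
```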

docs/md/demo.md (new file, +231)

@@ -0,0 +1,231 @@
# Interactive Demo for Crawler
<div id="demo">
<form id="crawlForm" class="terminal-form">
<fieldset>
<legend>Enter URL and Options</legend>
<div class="form-group">
<label for="url">Enter URL:</label>
<input type="text" id="url" name="url" required>
</div>
<div class="form-group">
<label for="screenshot">Get Screenshot:</label>
<input type="checkbox" id="screenshot" name="screenshot">
</div>
<div class="form-group">
<button class="btn btn-default" type="submit">Submit</button>
</div>
</fieldset>
</form>
<div id="loading" class="loading-message">
<div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
</div>
<section id="response" class="response-section">
<h2>Response</h2>
<div class="tabs">
<ul class="tab-list">
<li class="tab-item" onclick="showTab('markdown')">Markdown</li>
<li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
<li class="tab-item" onclick="showTab('media')">Media</li>
<li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
<li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
<li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
</ul>
<div class="tab-content" id="tab-markdown">
<header>
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
</div>
</header>
<pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
</div>
<div class="tab-content" id="tab-cleanedHtml" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
</div>
</header>
<pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
</div>
<div class="tab-content" id="tab-media" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
</div>
</header>
<pre><code id="mediaContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-extractedContent" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
</div>
</header>
<pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-screenshot" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
</div>
</header>
<pre><img id="screenshotContent" /></pre>
</div>
<div class="tab-content" id="tab-pythonCode" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
</div>
</header>
<pre><code id="pythonCode" class="language-python hljs"></code></pre>
</div>
</div>
</section>
<div id="error" class="error-message" style="display: none; margin-top:1em;">
<div class="terminal-alert terminal-alert-error"></div>
</div>
<script>
function showTab(tabId) {
const tabs = document.querySelectorAll('.tab-content');
tabs.forEach(tab => tab.style.display = 'none');
document.getElementById(`tab-${tabId}`).style.display = 'block';
}
function redo(codeBlock, codeText){
codeBlock.classList.remove('hljs');
codeBlock.removeAttribute('data-highlighted');
// Set new code and re-highlight
codeBlock.textContent = codeText;
hljs.highlightBlock(codeBlock);
}
function copyToClipboard(elementId) {
const content = document.getElementById(elementId).textContent;
navigator.clipboard.writeText(content).then(() => {
alert('Copied to clipboard');
});
}
function downloadContent(elementId, filename) {
const content = document.getElementById(elementId).textContent;
const blob = new Blob([content], { type: 'text/plain' });
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.style.display = 'none';
a.href = url;
a.download = filename;
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
}
function downloadImage(elementId, filename) {
const content = document.getElementById(elementId).src;
const a = document.createElement('a');
a.style.display = 'none';
a.href = content;
a.download = filename;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
}
document.getElementById('crawlForm').addEventListener('submit', function(event) {
event.preventDefault();
document.getElementById('loading').style.display = 'block';
document.getElementById('response').style.display = 'none';
const url = document.getElementById('url').value;
const screenshot = document.getElementById('screenshot').checked;
const data = {
urls: [url],
bypass_cache: false,
word_count_threshold: 5,
screenshot: screenshot
};
fetch('https://crawl4ai.com/crawl', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(data)
})
.then(response => {
if (!response.ok) {
if (response.status === 429) {
return response.json().then(err => {
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
});
}
throw new Error('Network response was not ok');
}
return response.json();
})
.then(data => {
data = data.results[0]; // Only one URL is requested
document.getElementById('loading').style.display = 'none';
document.getElementById('response').style.display = 'block';
redo(document.getElementById('markdownContent'), data.markdown);
redo(document.getElementById('cleanedHtmlContent'), data.cleaned_html);
redo(document.getElementById('mediaContent'), JSON.stringify(data.media, null, 2));
redo(document.getElementById('extractedContentContent'), data.extracted_content);
if (screenshot) {
document.getElementById('screenshotContent').src = `data:image/png;base64,${data.screenshot}`;
}
const pythonCode = `
from crawl4ai.web_crawler import WebCrawler
crawler = WebCrawler()
crawler.warmup()
result = crawler.run(
url='${url}',
screenshot=${screenshot}
)
print(result)
`;
redo(document.getElementById('pythonCode'), pythonCode);
document.getElementById('error').style.display = 'none';
})
.catch(error => {
document.getElementById('loading').style.display = 'none';
document.getElementById('error').style.display = 'block';
let errorMessage = 'An unexpected error occurred. Please try again later.';
if (error.status === 429) {
const details = error.details;
if (details.retry_after) {
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
} else if (details.reset_at) {
const resetTime = new Date(details.reset_at);
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
} else {
errorMessage = `Rate limit exceeded. Please try again later.`;
}
} else if (error.message) {
errorMessage = error.message;
}
document.querySelector('#error .terminal-alert').textContent = errorMessage;
});
});
</script>
</div>
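For reference, the request the demo's fetch() issues, as a Python sketch (endpoint and payload copied from the script above; the crawled URL is a placeholder):

```python
# Mirrors the demo's POST body; assumes the public endpoint accepts
# unauthenticated requests exactly as the JS above does.
import requests

payload = {
    "urls": ["https://example.com"],
    "bypass_cache": False,
    "word_count_threshold": 5,
    "screenshot": True,
}
resp = requests.post("https://crawl4ai.com/crawl", json=payload, timeout=60)
resp.raise_for_status()
result = resp.json()["results"][0]  # one URL in, one result out
print(result["markdown"][:200])
```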

(docs example; file path not shown)

@@ -99,7 +99,7 @@ async def main():
remove_overlay_elements=True,
# Cache control
cache_mode=CacheMode.ENABLED # Use cache if available
cache_mode=CacheMode.ENABLE # Use cache if available
)
if result.success:

main.py (158 lines changed)

@@ -380,97 +380,97 @@ def read_root():
return {"message": "Crawl4AI API service is running"}
@app.post("/crawl", dependencies=[Depends(verify_token)])
async def crawl(request: CrawlRequest) -> Dict[str, str]:
task_id = await crawler_service.submit_task(request)
return {"task_id": task_id}
# @app.post("/crawl", dependencies=[Depends(verify_token)])
# async def crawl(request: CrawlRequest) -> Dict[str, str]:
# task_id = await crawler_service.submit_task(request)
# return {"task_id": task_id}
@app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
async def get_task_status(task_id: str):
task_info = crawler_service.task_manager.get_task(task_id)
if not task_info:
raise HTTPException(status_code=404, detail="Task not found")
# @app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
# async def get_task_status(task_id: str):
# task_info = crawler_service.task_manager.get_task(task_id)
# if not task_info:
# raise HTTPException(status_code=404, detail="Task not found")
response = {
"status": task_info.status,
"created_at": task_info.created_at,
}
# response = {
# "status": task_info.status,
# "created_at": task_info.created_at,
# }
if task_info.status == TaskStatus.COMPLETED:
# Convert CrawlResult to dict for JSON response
if isinstance(task_info.result, list):
response["results"] = [result.dict() for result in task_info.result]
else:
response["result"] = task_info.result.dict()
elif task_info.status == TaskStatus.FAILED:
response["error"] = task_info.error
# if task_info.status == TaskStatus.COMPLETED:
# # Convert CrawlResult to dict for JSON response
# if isinstance(task_info.result, list):
# response["results"] = [result.dict() for result in task_info.result]
# else:
# response["result"] = task_info.result.dict()
# elif task_info.status == TaskStatus.FAILED:
# response["error"] = task_info.error
return response
# return response
@app.post("/crawl_sync", dependencies=[Depends(verify_token)])
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
task_id = await crawler_service.submit_task(request)
# @app.post("/crawl_sync", dependencies=[Depends(verify_token)])
# async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
# task_id = await crawler_service.submit_task(request)
# Wait up to 60 seconds for task completion
for _ in range(60):
task_info = crawler_service.task_manager.get_task(task_id)
if not task_info:
raise HTTPException(status_code=404, detail="Task not found")
# # Wait up to 60 seconds for task completion
# for _ in range(60):
# task_info = crawler_service.task_manager.get_task(task_id)
# if not task_info:
# raise HTTPException(status_code=404, detail="Task not found")
if task_info.status == TaskStatus.COMPLETED:
# Return same format as /task/{task_id} endpoint
if isinstance(task_info.result, list):
return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
return {"status": task_info.status, "result": task_info.result.dict()}
# if task_info.status == TaskStatus.COMPLETED:
# # Return same format as /task/{task_id} endpoint
# if isinstance(task_info.result, list):
# return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
# return {"status": task_info.status, "result": task_info.result.dict()}
if task_info.status == TaskStatus.FAILED:
raise HTTPException(status_code=500, detail=task_info.error)
# if task_info.status == TaskStatus.FAILED:
# raise HTTPException(status_code=500, detail=task_info.error)
await asyncio.sleep(1)
# await asyncio.sleep(1)
# If we get here, task didn't complete within timeout
raise HTTPException(status_code=408, detail="Task timed out")
# # If we get here, task didn't complete within timeout
# raise HTTPException(status_code=408, detail="Task timed out")
@app.post("/crawl_direct", dependencies=[Depends(verify_token)])
async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
try:
crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
# @app.post("/crawl_direct", dependencies=[Depends(verify_token)])
# async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
# try:
# crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
# extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
try:
if isinstance(request.urls, list):
results = await crawler.arun_many(
urls=[str(url) for url in request.urls],
extraction_strategy=extraction_strategy,
js_code=request.js_code,
wait_for=request.wait_for,
css_selector=request.css_selector,
screenshot=request.screenshot,
magic=request.magic,
cache_mode=request.cache_mode,
session_id=request.session_id,
**request.extra,
)
return {"results": [result.dict() for result in results]}
else:
result = await crawler.arun(
url=str(request.urls),
extraction_strategy=extraction_strategy,
js_code=request.js_code,
wait_for=request.wait_for,
css_selector=request.css_selector,
screenshot=request.screenshot,
magic=request.magic,
cache_mode=request.cache_mode,
session_id=request.session_id,
**request.extra,
)
return {"result": result.dict()}
finally:
await crawler_service.crawler_pool.release(crawler)
except Exception as e:
logger.error(f"Error in direct crawl: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
# try:
# if isinstance(request.urls, list):
# results = await crawler.arun_many(
# urls=[str(url) for url in request.urls],
# extraction_strategy=extraction_strategy,
# js_code=request.js_code,
# wait_for=request.wait_for,
# css_selector=request.css_selector,
# screenshot=request.screenshot,
# magic=request.magic,
# cache_mode=request.cache_mode,
# session_id=request.session_id,
# **request.extra,
# )
# return {"results": [result.dict() for result in results]}
# else:
# result = await crawler.arun(
# url=str(request.urls),
# extraction_strategy=extraction_strategy,
# js_code=request.js_code,
# wait_for=request.wait_for,
# css_selector=request.css_selector,
# screenshot=request.screenshot,
# magic=request.magic,
# cache_mode=request.cache_mode,
# session_id=request.session_id,
# **request.extra,
# )
# return {"result": result.dict()}
# finally:
# await crawler_service.crawler_pool.release(crawler)
# except Exception as e:
# logger.error(f"Error in direct crawl: {str(e)}")
# raise HTTPException(status_code=500, detail=str(e))
@app.get("/health")
async def health_check():
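For context on what was disabled: the commented-out endpoints implemented a task-based flow in which POST /crawl returned a task_id and GET /task/{task_id} reported status and, once completed, results. A hedged client sketch of that flow (host, port, token, and the serialized status strings are assumptions):

```python
# Sketch of the pre-removal task API, reconstructed from the commented-out
# handlers above; endpoint shapes come from that code, everything else
# (base URL, token, status values) is assumed.
import time
import requests

BASE = "http://localhost:11235"                    # assumed self-hosted server
HEADERS = {"Authorization": "Bearer YOUR_TOKEN"}   # for the verify_token guard

task_id = requests.post(
    f"{BASE}/crawl", json={"urls": "https://example.com"}, headers=HEADERS
).json()["task_id"]

for _ in range(60):                                # mirror the 60 s wait loop
    info = requests.get(f"{BASE}/task/{task_id}", headers=HEADERS).json()
    if info["status"] in ("completed", "failed"):
        break
    time.sleep(1)

print(info["status"], info.get("error"))
```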

mkdocs.yml

@@ -8,7 +8,7 @@ docs_dir: docs/md_v2
nav:
- Home: 'index.md'
- 'Installation': 'basic/installation.md'
- 'Docker Deplotment': 'basic/docker-deploymeny.md'
- 'Docker Deployment': 'basic/docker-deploymeny.md'
- 'Quick Start': 'basic/quickstart.md'
- Changelog & Blog:
- 'Blog Home': 'blog/index.md'

setup.py

@@ -57,9 +57,6 @@ setup(
author_email="unclecode@kidocode.com",
license="MIT",
packages=find_packages(),
package_data={
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
},
install_requires=default_requirements
+ ["playwright", "aiofiles"], # Added aiofiles
extras_require={