Delete a.md

Fix #340 example llm_extraction (#358 )
@Haopeng138 Thank you so much. They are still part of the library. I forgot to update them since I moved the asynchronous versions years ago. I really appreciate it. I have to say that I feel weak in the documentation. That's why I spent a lot of time on it last week. Now, when you mention some of the things in the example folder, I realize I forgot about the example folder. I'll try to update it more. If you find anything else, please help and support. Thank you. I will add your name to contributor name as well.
2024-12-25 19:43:39 +08:00 · 2024-12-24 19:56:07 +08:00 · 2024-12-15 19:49:38 +08:00 · 2024-12-15 19:49:30 +08:00 · 2024-12-13 21:51:38 +08:00
13 changed files with 165 additions and 4588 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,2 @@
-include requirements.txt
+include requirements.txt
 recursive-include crawl4ai/js_snippet *.js
--- a/a.md
+++ b/a.md
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.2"
+__version__ = "0.4.22"
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -7,6 +7,7 @@ from .config import (
 from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
 from .chunking_strategy import ChunkingStrategy
 from .markdown_generation_strategy import MarkdownGenerationStrategy
 class BrowserConfig:
    """
@@ -269,6 +270,7 @@ class CrawlerRunConfig:
        word_count_threshold: int =  MIN_WORD_THRESHOLD ,
        extraction_strategy : ExtractionStrategy=None,  # Will default to NoExtractionStrategy if None
        chunking_strategy : ChunkingStrategy= None,    # Will default to RegexChunking if None
        markdown_generator : MarkdownGenerationStrategy = None,
        content_filter=None,
        cache_mode=None,
        session_id: str = None,
@@ -309,6 +311,7 @@ class CrawlerRunConfig:
        self.word_count_threshold = word_count_threshold
        self.extraction_strategy = extraction_strategy
        self.chunking_strategy = chunking_strategy
        self.markdown_generator = markdown_generator
        self.content_filter = content_filter
        self.cache_mode = cache_mode
        self.session_id = session_id
@@ -364,6 +367,7 @@ class CrawlerRunConfig:
            word_count_threshold=kwargs.get("word_count_threshold", 200),
            extraction_strategy=kwargs.get("extraction_strategy"),
            chunking_strategy=kwargs.get("chunking_strategy"),
            markdown_generator=kwargs.get("markdown_generator"),
            content_filter=kwargs.get("content_filter"),
            cache_mode=kwargs.get("cache_mode"),
            session_id=kwargs.get("session_id"),
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -7,7 +7,8 @@ from pathlib import Path
 from typing import Optional, List, Union
 import json
 import asyncio
-from contextlib import nullcontext, asynccontextmanager
+# from contextlib import nullcontext, asynccontextmanager
 from contextlib import asynccontextmanager
 from .models import CrawlResult, MarkdownGenerationResult
 from .async_database import async_db_manager
 from .chunking_strategy import *
@@ -15,6 +16,7 @@ from .content_filter_strategy import *
 from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
 from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
 from .content_scraping_strategy import WebScrapingStrategy
 from .async_logger import AsyncLogger
 from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -132,17 +134,12 @@ class AsyncWebCrawler:
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
    @asynccontextmanager
    async def nullcontext(self):
        yield
    async def awarmup(self):
        """Initialize the crawler with warm-up sequence."""
        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
        self.ready = True
    @asynccontextmanager
    async def nullcontext(self):
        """异步空上下文管理器"""
@@ -323,7 +320,8 @@ class AsyncWebCrawler:
                        config=config,  # Pass the config object instead of individual parameters
                        screenshot=screenshot_data,
                        pdf_data=pdf_data,
-                        verbose=config.verbose
+                        verbose=config.verbose,
                        **kwargs
                    )
                    # Set response data
@@ -424,7 +422,8 @@ class AsyncWebCrawler:
                    css_selector=config.css_selector,
                    only_text=config.only_text,
                    image_description_min_word_threshold=config.image_description_min_word_threshold,
-                    content_filter=config.content_filter
+                    content_filter=config.content_filter,
                    **kwargs
                )
                if result is None:
@@ -435,16 +434,29 @@ class AsyncWebCrawler:
            except Exception as e:
                raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
            # Extract results
            markdown_v2 = result.get("markdown_v2", None)
            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
            markdown = sanitize_input_encode(result.get("markdown", ""))
            fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
            fit_html = sanitize_input_encode(result.get("fit_html", ""))
            media = result.get("media", [])
            links = result.get("links", [])
            metadata = result.get("metadata", {})
            # Markdown Generation
            markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
            if not config.content_filter and not markdown_generator.content_filter:
                markdown_generator.content_filter = PruningContentFilter()
            markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
                cleaned_html=cleaned_html,
                base_url=url,
                # html2text_options=kwargs.get('html2text', {})
            )
            markdown_v2 = markdown_result
            markdown = sanitize_input_encode(markdown_result.raw_markdown)
            # Log processing completion
            self.logger.info(
                message="Processed {url:.50}... | Time: {timing}ms",
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')
-        markdown_content = self._generate_markdown_content(
+        # markdown_content = self._generate_markdown_content(
-            cleaned_html=cleaned_html,
+        #     cleaned_html=cleaned_html,
-            html=html,
+        #     html=html,
-            url=url,
+        #     url=url,
-            success=success,
+        #     success=success,
-            **kwargs
+        #     **kwargs
-        )
+        # )
        return {
-            **markdown_content,
+            # **markdown_content,
            'cleaned_html': cleaned_html,
            'success': success,
            'media': media,
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,41 +1,40 @@
 import os
 import time
 from crawl4ai.web_crawler import WebCrawler
 from crawl4ai.chunking_strategy import *
 from crawl4ai.extraction_strategy import *
 from crawl4ai.crawler_strategy import *
 import asyncio
 from pydantic import BaseModel, Field
 url = r'https://openai.com/api/pricing/'
 crawler = WebCrawler()
 crawler.warmup()
 from pydantic import BaseModel, Field
 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
-result = crawler.run(
+from crawl4ai import AsyncWebCrawler
    url=url,
    word_count_threshold=1,
    extraction_strategy= LLMExtractionStrategy(
        # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), 
        provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'), 
        schema=OpenAIModelFee.model_json_schema(),
        extraction_type="schema",
        instruction="From the crawled content, extract all mentioned model names along with their "\
            "fees for input and output tokens. Make sure not to miss anything in the entire content. "\
            'One extracted model JSON format should look like this: '\
            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
    ),
    bypass_cache=True,
 )
-model_fees = json.loads(result.extracted_content)
+async def main():
    # Use AsyncWebCrawler
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy= LLMExtractionStrategy(
                # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
                provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="From the crawled content, extract all mentioned model names along with their " \
                            "fees for input and output tokens. Make sure not to miss anything in the entire content. " \
                            'One extracted model JSON format should look like this: ' \
                            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
            ),
-print(len(model_fees))
+        )
        print("Success:", result.success)
        model_fees = json.loads(result.extracted_content)
        print(len(model_fees))
-with open(".data/data.json", "w", encoding="utf-8") as f:
+        with open(".data/data.json", "w", encoding="utf-8") as f:
-    f.write(result.extracted_content)
+            f.write(result.extracted_content)
 asyncio.run(main())
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -142,6 +142,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
        page_timeout = 80000,
        extraction_strategy=LLMExtractionStrategy(
            provider=provider,
            api_token=api_token,
@@ -497,21 +498,21 @@ async def main():
    # Advanced examples
    # await extract_structured_data_using_css_extractor()
-    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
    # Browser comparisons
-    await crawl_custom_browser_type()
+    # await crawl_custom_browser_type()
    # Performance testing
    # await speed_comparison()
    # Screenshot example
-    await capture_and_save_screenshot(
+    # await capture_and_save_screenshot(
-        "https://www.example.com",
+    #     "https://www.example.com",
-        os.path.join(__location__, "tmp/example_screenshot.jpg")
+    #     os.path.join(__location__, "tmp/example_screenshot.jpg")
-    )
+    # )
 if __name__ == "__main__":
    asyncio.run(main())
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
        all_commits = []
        js_next_page = """
-        const button = document.querySelector('a[data-testid="pagination-next-button"]');
+        (() => {
-        if (button) button.click();
+            const button = document.querySelector('a[data-testid="pagination-next-button"]');
            if (button) button.click();
        })();
        """
        for page in range(3):  # Crawl 3 pages
@@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay():
 async def main():
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-    await simple_crawl()
+    # await simple_crawl()
-    await simple_example_with_running_js_code()
+    # await simple_example_with_running_js_code()
-    await simple_example_with_css_selector()
+    # await simple_example_with_css_selector()
-    # await use_proxy()
+    # # await use_proxy()
-    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    await extract_structured_data_using_css_extractor()
+    # await extract_structured_data_using_css_extractor()
    # LLM extraction examples
    # await extract_structured_data_using_llm()
--- a/docs/md/demo.md
+++ b/docs/md/demo.md
@@ -1,231 +0,0 @@
 # Interactive Demo for Crawler
 <div id="demo">
    <form id="crawlForm" class="terminal-form">
        <fieldset>
            <legend>Enter URL and Options</legend>
            <div class="form-group">
                <label for="url">Enter URL:</label>
                <input type="text" id="url" name="url" required>
            </div>
            <div class="form-group">
                <label for="screenshot">Get Screenshot:</label>
                <input type="checkbox" id="screenshot" name="screenshot">
            </div>
            <div class="form-group">
                <button class="btn btn-default" type="submit">Submit</button>
            </div>
        </fieldset>
    </form>
    <div id="loading" class="loading-message">
        <div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
    </div>
    <section id="response" class="response-section">
        <h2>Response</h2>
        <div class="tabs">
            <ul class="tab-list">
                <li class="tab-item" onclick="showTab('markdown')">Markdown</li>
                <li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
                <li class="tab-item" onclick="showTab('media')">Media</li>
                <li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
                <li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
                <li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
            </ul>
            <div class="tab-content" id="tab-markdown">
                <header>
                    <div>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
                    </div>
                </header>
                <pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
            </div>
            <div class="tab-content" id="tab-cleanedHtml" style="display: none;">
                <header >
                    <div>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
                    </div>
                </header>
                <pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
            </div>
            <div class="tab-content" id="tab-media" style="display: none;">
                <header >
                    <div>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
                    </div>
                </header>
                <pre><code id="mediaContent" class="language-json hljs"></code></pre>
            </div>
            <div class="tab-content" id="tab-extractedContent" style="display: none;">
                <header >
                    <div>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
                    </div>
                </header>
                <pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
            </div>
            <div class="tab-content" id="tab-screenshot" style="display: none;">
                <header >
                    <div>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
                    </div>
                </header>
                <pre><img id="screenshotContent" /></pre>
            </div>
            <div class="tab-content" id="tab-pythonCode" style="display: none;">
                <header >
                    <div>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
                        <button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
                    </div>
                </header>
                <pre><code id="pythonCode" class="language-python hljs"></code></pre>
            </div>
        </div>
    </section>
    <div id="error" class="error-message" style="display: none; margin-top:1em;">
        <div class="terminal-alert terminal-alert-error"></div>
    </div>
    <script>
        function showTab(tabId) {
            const tabs = document.querySelectorAll('.tab-content');
            tabs.forEach(tab => tab.style.display = 'none');
            document.getElementById(`tab-${tabId}`).style.display = 'block';
        }
        function redo(codeBlock, codeText){
            codeBlock.classList.remove('hljs');
            codeBlock.removeAttribute('data-highlighted');
            // Set new code and re-highlight
            codeBlock.textContent = codeText;
            hljs.highlightBlock(codeBlock);
        }
        function copyToClipboard(elementId) {
            const content = document.getElementById(elementId).textContent;
            navigator.clipboard.writeText(content).then(() => {
                alert('Copied to clipboard');
            });
        }
        function downloadContent(elementId, filename) {
            const content = document.getElementById(elementId).textContent;
            const blob = new Blob([content], { type: 'text/plain' });
            const url = window.URL.createObjectURL(blob);
            const a = document.createElement('a');
            a.style.display = 'none';
            a.href = url;
            a.download = filename;
            document.body.appendChild(a);
            a.click();
            window.URL.revokeObjectURL(url);
            document.body.removeChild(a);
        }
        function downloadImage(elementId, filename) {
            const content = document.getElementById(elementId).src;
            const a = document.createElement('a');
            a.style.display = 'none';
            a.href = content;
            a.download = filename;
            document.body.appendChild(a);
            a.click();
            document.body.removeChild(a);
        }
        document.getElementById('crawlForm').addEventListener('submit', function(event) {
            event.preventDefault();
            document.getElementById('loading').style.display = 'block';
            document.getElementById('response').style.display = 'none';
            const url = document.getElementById('url').value;
            const screenshot = document.getElementById('screenshot').checked;
            const data = {
                urls: [url],
                bypass_cache: false,
                word_count_threshold: 5,
                screenshot: screenshot
            };
            fetch('https://crawl4ai.com/crawl', {
                method: 'POST',
                headers: {
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify(data)
            })
            .then(response => {
                if (!response.ok) {
                    if (response.status === 429) {
                        return response.json().then(err => { 
                            throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
                        });
                    }
                    throw new Error('Network response was not ok');
                }
                return response.json();
            })
            .then(data => {
                data = data.results[0]; // Only one URL is requested
                document.getElementById('loading').style.display = 'none';
                document.getElementById('response').style.display = 'block';
                redo(document.getElementById('markdownContent'), data.markdown);
                redo(document.getElementById('cleanedHtmlContent'), data.cleaned_html);
                redo(document.getElementById('mediaContent'), JSON.stringify(data.media, null, 2));
                redo(document.getElementById('extractedContentContent'), data.extracted_content);
                if (screenshot) {
                    document.getElementById('screenshotContent').src = `data:image/png;base64,${data.screenshot}`;
                }
                const pythonCode = `
 from crawl4ai.web_crawler import WebCrawler
 crawler = WebCrawler()
 crawler.warmup()
 result = crawler.run(
    url='${url}',
    screenshot=${screenshot}
 )
 print(result)
                `;
                redo(document.getElementById('pythonCode'), pythonCode);
                document.getElementById('error').style.display = 'none';
            })
            .catch(error => {
                document.getElementById('loading').style.display = 'none';
                document.getElementById('error').style.display = 'block';
                let errorMessage = 'An unexpected error occurred. Please try again later.';
                if (error.status === 429) {
                    const details = error.details;
                    if (details.retry_after) {
                        errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
                    } else if (details.reset_at) {
                        const resetTime = new Date(details.reset_at);
                        const waitTime = Math.ceil((resetTime - new Date()) / 1000);
                        errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
                    } else {
                        errorMessage = `Rate limit exceeded. Please try again later.`;
                    }
                } else if (error.message) {
                    errorMessage = error.message;
                }
                document.querySelector('#error .terminal-alert').textContent = errorMessage;
            });
        });
    </script>
 </div>
--- a/main.py
+++ b/main.py
@@ -380,97 +380,97 @@ def read_root():
    return {"message": "Crawl4AI API service is running"}
-# @app.post("/crawl", dependencies=[Depends(verify_token)])
+@app.post("/crawl", dependencies=[Depends(verify_token)])
-# async def crawl(request: CrawlRequest) -> Dict[str, str]:
+async def crawl(request: CrawlRequest) -> Dict[str, str]:
-#     task_id = await crawler_service.submit_task(request)
+    task_id = await crawler_service.submit_task(request)
-#     return {"task_id": task_id}
+    return {"task_id": task_id}
-# @app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
+@app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
-# async def get_task_status(task_id: str):
+async def get_task_status(task_id: str):
-#     task_info = crawler_service.task_manager.get_task(task_id)
+    task_info = crawler_service.task_manager.get_task(task_id)
-#     if not task_info:
+    if not task_info:
-#         raise HTTPException(status_code=404, detail="Task not found")
+        raise HTTPException(status_code=404, detail="Task not found")
-#     response = {
+    response = {
-#         "status": task_info.status,
+        "status": task_info.status,
-#         "created_at": task_info.created_at,
+        "created_at": task_info.created_at,
-#     }
+    }
-#     if task_info.status == TaskStatus.COMPLETED:
+    if task_info.status == TaskStatus.COMPLETED:
-#         # Convert CrawlResult to dict for JSON response
+        # Convert CrawlResult to dict for JSON response
-#         if isinstance(task_info.result, list):
+        if isinstance(task_info.result, list):
-#             response["results"] = [result.dict() for result in task_info.result]
+            response["results"] = [result.dict() for result in task_info.result]
-#         else:
+        else:
-#             response["result"] = task_info.result.dict()
+            response["result"] = task_info.result.dict()
-#     elif task_info.status == TaskStatus.FAILED:
+    elif task_info.status == TaskStatus.FAILED:
-#         response["error"] = task_info.error
+        response["error"] = task_info.error
-#     return response
+    return response
-# @app.post("/crawl_sync", dependencies=[Depends(verify_token)])
+@app.post("/crawl_sync", dependencies=[Depends(verify_token)])
-# async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
+async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
-#     task_id = await crawler_service.submit_task(request)
+    task_id = await crawler_service.submit_task(request)
-#     # Wait up to 60 seconds for task completion
+    # Wait up to 60 seconds for task completion
-#     for _ in range(60):
+    for _ in range(60):
-#         task_info = crawler_service.task_manager.get_task(task_id)
+        task_info = crawler_service.task_manager.get_task(task_id)
-#         if not task_info:
+        if not task_info:
-#             raise HTTPException(status_code=404, detail="Task not found")
+            raise HTTPException(status_code=404, detail="Task not found")
-#         if task_info.status == TaskStatus.COMPLETED:
+        if task_info.status == TaskStatus.COMPLETED:
-#             # Return same format as /task/{task_id} endpoint
+            # Return same format as /task/{task_id} endpoint
-#             if isinstance(task_info.result, list):
+            if isinstance(task_info.result, list):
-#                 return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
+                return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
-#             return {"status": task_info.status, "result": task_info.result.dict()}
+            return {"status": task_info.status, "result": task_info.result.dict()}
-#         if task_info.status == TaskStatus.FAILED:
+        if task_info.status == TaskStatus.FAILED:
-#             raise HTTPException(status_code=500, detail=task_info.error)
+            raise HTTPException(status_code=500, detail=task_info.error)
-#         await asyncio.sleep(1)
+        await asyncio.sleep(1)
-#     # If we get here, task didn't complete within timeout
+    # If we get here, task didn't complete within timeout
-#     raise HTTPException(status_code=408, detail="Task timed out")
+    raise HTTPException(status_code=408, detail="Task timed out")
-# @app.post("/crawl_direct", dependencies=[Depends(verify_token)])
+@app.post("/crawl_direct", dependencies=[Depends(verify_token)])
-# async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
+async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
-#     try:
+    try:
-#         crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
+        crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
-#         extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
+        extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
-#         try:
+        try:
-#             if isinstance(request.urls, list):
+            if isinstance(request.urls, list):
-#                 results = await crawler.arun_many(
+                results = await crawler.arun_many(
-#                     urls=[str(url) for url in request.urls],
+                    urls=[str(url) for url in request.urls],
-#                     extraction_strategy=extraction_strategy,
+                    extraction_strategy=extraction_strategy,
-#                     js_code=request.js_code,
+                    js_code=request.js_code,
-#                     wait_for=request.wait_for,
+                    wait_for=request.wait_for,
-#                     css_selector=request.css_selector,
+                    css_selector=request.css_selector,
-#                     screenshot=request.screenshot,
+                    screenshot=request.screenshot,
-#                     magic=request.magic,
+                    magic=request.magic,
-#                     cache_mode=request.cache_mode,
+                    cache_mode=request.cache_mode,
-#                     session_id=request.session_id,
+                    session_id=request.session_id,
-#                     **request.extra,
+                    **request.extra,
-#                 )
+                )
-#                 return {"results": [result.dict() for result in results]}
+                return {"results": [result.dict() for result in results]}
-#             else:
+            else:
-#                 result = await crawler.arun(
+                result = await crawler.arun(
-#                     url=str(request.urls),
+                    url=str(request.urls),
-#                     extraction_strategy=extraction_strategy,
+                    extraction_strategy=extraction_strategy,
-#                     js_code=request.js_code,
+                    js_code=request.js_code,
-#                     wait_for=request.wait_for,
+                    wait_for=request.wait_for,
-#                     css_selector=request.css_selector,
+                    css_selector=request.css_selector,
-#                     screenshot=request.screenshot,
+                    screenshot=request.screenshot,
-#                     magic=request.magic,
+                    magic=request.magic,
-#                     cache_mode=request.cache_mode,
+                    cache_mode=request.cache_mode,
-#                     session_id=request.session_id,
+                    session_id=request.session_id,
-#                     **request.extra,
+                    **request.extra,
-#                 )
+                )
-#                 return {"result": result.dict()}
+                return {"result": result.dict()}
-#         finally:
+        finally:
-#             await crawler_service.crawler_pool.release(crawler)
+            await crawler_service.crawler_pool.release(crawler)
-#     except Exception as e:
+    except Exception as e:
-#         logger.error(f"Error in direct crawl: {str(e)}")
+        logger.error(f"Error in direct crawl: {str(e)}")
-#         raise HTTPException(status_code=500, detail=str(e))
+        raise HTTPException(status_code=500, detail=str(e))
@app.get("/health")
 async def health_check():
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -8,7 +8,7 @@ docs_dir: docs/md_v2
 nav:
  - Home: 'index.md'
  - 'Installation': 'basic/installation.md'
-  - 'Docker Deployment': 'basic/docker-deploymeny.md'
+  - 'Docker Deployment': 'basic/docker-deploymeny.md'
  - 'Quick Start': 'basic/quickstart.md'
  - Changelog & Blog:
    - 'Blog Home': 'blog/index.md'
--- a/setup.py
+++ b/setup.py
@@ -57,6 +57,9 @@ setup(
    author_email="unclecode@kidocode.com",
    license="MIT",
    packages=find_packages(),
    package_data={
        'crawl4ai': ['js_snippet/*.js']  # This matches the exact path structure
    },
    install_requires=default_requirements
    + ["playwright", "aiofiles"],  # Added aiofiles
    extras_require={
Author	SHA1	Message	Date
UncleCode	d97a075082	Delete a.md	2024-12-25 19:43:39 +08:00
Haopeng138	bacbeb3ed4	Fix #340 example llm_extraction (#358 ) @Haopeng138 Thank you so much. They are still part of the library. I forgot to update them since I moved the asynchronous versions years ago. I really appreciate it. I have to say that I feel weak in the documentation. That's why I spent a lot of time on it last week. Now, when you mention some of the things in the example folder, I realize I forgot about the example folder. I'll try to update it more. If you find anything else, please help and support. Thank you. I will add your name to contributor name as well.	2024-12-24 19:56:07 +08:00
UncleCode	ed7bc1909c	Bump version to 0.4.22	2024-12-15 19:49:38 +08:00
UncleCode	e9e5b5642d	Fix js_snippet issue 0.4.21 bump to 0.4.22	2024-12-15 19:49:30 +08:00
UncleCode	7524aa7b5e	Feature: Add Markdown generation to CrawlerRunConfig - Added markdown generator parameter to CrawlerRunConfig in `async_configs.py`. - Implemented logic for Markdown generation in content scraping in `async_webcrawler.py`. - Updated version number to 0.4.21 in `__version__.py`.	2024-12-13 21:51:38 +08:00
`@@ -1 +1,2 @@`
	`include requirements.txt`	`include requirements.txt`
		`recursive-include crawl4ai/js_snippet *.js`
`@@ -1,2 +1,2 @@`
	`# crawl4ai/_version.py`	`# crawl4ai/_version.py`
	`__version__ = "0.4.2"`	`__version__ = "0.4.22"`