Compare commits

..

5 Commits

Author SHA1 Message Date
UncleCode
d97a075082 Delete a.md 2024-12-25 19:43:39 +08:00
Haopeng138
bacbeb3ed4 Fix #340 example llm_extraction (#358)
@Haopeng138 Thank you so much. They are still part of the library. I forgot to update them when I moved to the asynchronous versions a while ago. I really appreciate it. I have to say that I feel the documentation is a weak point, which is why I spent a lot of time on it last week. When you mentioned the example folder, I realized I had forgotten to update it as well. I'll try to keep it more up to date. If you find anything else, please help and support. Thank you. I will add your name to the contributors list as well.
2024-12-24 19:56:07 +08:00
UncleCode
ed7bc1909c Bump version to 0.4.22 2024-12-15 19:49:38 +08:00
UncleCode
e9e5b5642d Fix js_snippet issue 0.4.21
bump to 0.4.22
2024-12-15 19:49:30 +08:00
UncleCode
7524aa7b5e Feature: Add Markdown generation to CrawlerRunConfig
- Added markdown generator parameter to CrawlerRunConfig in `async_configs.py`.
  - Implemented logic for Markdown generation in content scraping in `async_webcrawler.py`.
  - Updated version number to 0.4.21 in `__version__.py`.
2024-12-13 21:51:38 +08:00
13 changed files with 165 additions and 4588 deletions

View File

@@ -1 +1,2 @@
include requirements.txt include requirements.txt
recursive-include crawl4ai/js_snippet *.js

4214
a.md

File diff suppressed because it is too large Load Diff

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.4.2" __version__ = "0.4.22"

View File

@@ -7,6 +7,7 @@ from .config import (
from .user_agent_generator import UserAgentGenerator from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy from .chunking_strategy import ChunkingStrategy
from .markdown_generation_strategy import MarkdownGenerationStrategy
class BrowserConfig: class BrowserConfig:
""" """
@@ -269,6 +270,7 @@ class CrawlerRunConfig:
word_count_threshold: int = MIN_WORD_THRESHOLD , word_count_threshold: int = MIN_WORD_THRESHOLD ,
extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None
chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None
markdown_generator : MarkdownGenerationStrategy = None,
content_filter=None, content_filter=None,
cache_mode=None, cache_mode=None,
session_id: str = None, session_id: str = None,
@@ -309,6 +311,7 @@ class CrawlerRunConfig:
self.word_count_threshold = word_count_threshold self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator
self.content_filter = content_filter self.content_filter = content_filter
self.cache_mode = cache_mode self.cache_mode = cache_mode
self.session_id = session_id self.session_id = session_id
@@ -364,6 +367,7 @@ class CrawlerRunConfig:
word_count_threshold=kwargs.get("word_count_threshold", 200), word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"), extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy"), chunking_strategy=kwargs.get("chunking_strategy"),
markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"), content_filter=kwargs.get("content_filter"),
cache_mode=kwargs.get("cache_mode"), cache_mode=kwargs.get("cache_mode"),
session_id=kwargs.get("session_id"), session_id=kwargs.get("session_id"),

View File

@@ -7,7 +7,8 @@ from pathlib import Path
from typing import Optional, List, Union from typing import Optional, List, Union
import json import json
import asyncio import asyncio
from contextlib import nullcontext, asynccontextmanager # from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager from .async_database import async_db_manager
from .chunking_strategy import * from .chunking_strategy import *
@@ -15,6 +16,7 @@ from .content_filter_strategy import *
from .extraction_strategy import * from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -133,16 +135,11 @@ class AsyncWebCrawler:
async def __aexit__(self, exc_type, exc_val, exc_tb): async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
@asynccontextmanager
async def nullcontext(self):
yield
async def awarmup(self): async def awarmup(self):
"""Initialize the crawler with warm-up sequence.""" """Initialize the crawler with warm-up sequence."""
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
self.ready = True self.ready = True
@asynccontextmanager @asynccontextmanager
async def nullcontext(self): async def nullcontext(self):
"""异步空上下文管理器""" """异步空上下文管理器"""
@@ -323,7 +320,8 @@ class AsyncWebCrawler:
config=config, # Pass the config object instead of individual parameters config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data, screenshot=screenshot_data,
pdf_data=pdf_data, pdf_data=pdf_data,
verbose=config.verbose verbose=config.verbose,
**kwargs
) )
# Set response data # Set response data
@@ -424,7 +422,8 @@ class AsyncWebCrawler:
css_selector=config.css_selector, css_selector=config.css_selector,
only_text=config.only_text, only_text=config.only_text,
image_description_min_word_threshold=config.image_description_min_word_threshold, image_description_min_word_threshold=config.image_description_min_word_threshold,
content_filter=config.content_filter content_filter=config.content_filter,
**kwargs
) )
if result is None: if result is None:
@@ -435,16 +434,29 @@ class AsyncWebCrawler:
except Exception as e: except Exception as e:
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
# Extract results # Extract results
markdown_v2 = result.get("markdown_v2", None)
cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
fit_html = sanitize_input_encode(result.get("fit_html", "")) fit_html = sanitize_input_encode(result.get("fit_html", ""))
media = result.get("media", []) media = result.get("media", [])
links = result.get("links", []) links = result.get("links", [])
metadata = result.get("metadata", {}) metadata = result.get("metadata", {})
# Markdown Generation
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
if not config.content_filter and not markdown_generator.content_filter:
markdown_generator.content_filter = PruningContentFilter()
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
# html2text_options=kwargs.get('html2text', {})
)
markdown_v2 = markdown_result
markdown = sanitize_input_encode(markdown_result.raw_markdown)
# Log processing completion # Log processing completion
self.logger.info( self.logger.info(
message="Processed {url:.50}... | Time: {timing}ms", message="Processed {url:.50}... | Time: {timing}ms",

View File

@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
markdown_content = self._generate_markdown_content( # markdown_content = self._generate_markdown_content(
cleaned_html=cleaned_html, # cleaned_html=cleaned_html,
html=html, # html=html,
url=url, # url=url,
success=success, # success=success,
**kwargs # **kwargs
) # )
return { return {
**markdown_content, # **markdown_content,
'cleaned_html': cleaned_html, 'cleaned_html': cleaned_html,
'success': success, 'success': success,
'media': media, 'media': media,

View File

@@ -1,23 +1,21 @@
import os
import time
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import * from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import * from crawl4ai.crawler_strategy import *
import asyncio
from pydantic import BaseModel, Field
url = r'https://openai.com/api/pricing/' url = r'https://openai.com/api/pricing/'
crawler = WebCrawler()
crawler.warmup()
from pydantic import BaseModel, Field
class OpenAIModelFee(BaseModel): class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.") model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
result = crawler.run( from crawl4ai import AsyncWebCrawler
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=url, url=url,
word_count_threshold=1, word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy( extraction_strategy= LLMExtractionStrategy(
@@ -30,12 +28,13 @@ result = crawler.run(
'One extracted model JSON format should look like this: ' \ 'One extracted model JSON format should look like this: ' \
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
), ),
bypass_cache=True,
) )
print("Success:", result.success)
model_fees = json.loads(result.extracted_content) model_fees = json.loads(result.extracted_content)
print(len(model_fees)) print(len(model_fees))
with open(".data/data.json", "w", encoding="utf-8") as f: with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content) f.write(result.extracted_content)
asyncio.run(main())

View File

@@ -142,6 +142,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
crawler_config = CrawlerRunConfig( crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
word_count_threshold=1, word_count_threshold=1,
page_timeout = 80000,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider=provider, provider=provider,
api_token=api_token, api_token=api_token,
@@ -497,21 +498,21 @@ async def main():
# Advanced examples # Advanced examples
# await extract_structured_data_using_css_extractor() # await extract_structured_data_using_css_extractor()
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2() # await crawl_dynamic_content_pages_method_2()
# Browser comparisons # Browser comparisons
await crawl_custom_browser_type() # await crawl_custom_browser_type()
# Performance testing # Performance testing
# await speed_comparison() # await speed_comparison()
# Screenshot example # Screenshot example
await capture_and_save_screenshot( # await capture_and_save_screenshot(
"https://www.example.com", # "https://www.example.com",
os.path.join(__location__, "tmp/example_screenshot.jpg") # os.path.join(__location__, "tmp/example_screenshot.jpg")
) # )
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())

View File

@@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
all_commits = [] all_commits = []
js_next_page = """ js_next_page = """
(() => {
const button = document.querySelector('a[data-testid="pagination-next-button"]'); const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click(); if (button) button.click();
})();
""" """
for page in range(3): # Crawl 3 pages for page in range(3): # Crawl 3 pages
@@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay():
async def main(): async def main():
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
await simple_crawl() # await simple_crawl()
await simple_example_with_running_js_code() # await simple_example_with_running_js_code()
await simple_example_with_css_selector() # await simple_example_with_css_selector()
# await use_proxy() # # await use_proxy()
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
await extract_structured_data_using_css_extractor() # await extract_structured_data_using_css_extractor()
# LLM extraction examples # LLM extraction examples
# await extract_structured_data_using_llm() # await extract_structured_data_using_llm()

View File

@@ -1,231 +0,0 @@
# Interactive Demo for Crawler
<div id="demo">
<form id="crawlForm" class="terminal-form">
<fieldset>
<legend>Enter URL and Options</legend>
<div class="form-group">
<label for="url">Enter URL:</label>
<input type="text" id="url" name="url" required>
</div>
<div class="form-group">
<label for="screenshot">Get Screenshot:</label>
<input type="checkbox" id="screenshot" name="screenshot">
</div>
<div class="form-group">
<button class="btn btn-default" type="submit">Submit</button>
</div>
</fieldset>
</form>
<div id="loading" class="loading-message">
<div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
</div>
<section id="response" class="response-section">
<h2>Response</h2>
<div class="tabs">
<ul class="tab-list">
<li class="tab-item" onclick="showTab('markdown')">Markdown</li>
<li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
<li class="tab-item" onclick="showTab('media')">Media</li>
<li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
<li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
<li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
</ul>
<div class="tab-content" id="tab-markdown">
<header>
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
</div>
</header>
<pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
</div>
<div class="tab-content" id="tab-cleanedHtml" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
</div>
</header>
<pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
</div>
<div class="tab-content" id="tab-media" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
</div>
</header>
<pre><code id="mediaContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-extractedContent" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
</div>
</header>
<pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
</div>
<div class="tab-content" id="tab-screenshot" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
</div>
</header>
<pre><img id="screenshotContent" /></pre>
</div>
<div class="tab-content" id="tab-pythonCode" style="display: none;">
<header >
<div>
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
</div>
</header>
<pre><code id="pythonCode" class="language-python hljs"></code></pre>
</div>
</div>
</section>
<div id="error" class="error-message" style="display: none; margin-top:1em;">
<div class="terminal-alert terminal-alert-error"></div>
</div>
<script>
function showTab(tabId) {
const tabs = document.querySelectorAll('.tab-content');
tabs.forEach(tab => tab.style.display = 'none');
document.getElementById(`tab-${tabId}`).style.display = 'block';
}
function redo(codeBlock, codeText){
codeBlock.classList.remove('hljs');
codeBlock.removeAttribute('data-highlighted');
// Set new code and re-highlight
codeBlock.textContent = codeText;
hljs.highlightBlock(codeBlock);
}
function copyToClipboard(elementId) {
const content = document.getElementById(elementId).textContent;
navigator.clipboard.writeText(content).then(() => {
alert('Copied to clipboard');
});
}
function downloadContent(elementId, filename) {
const content = document.getElementById(elementId).textContent;
const blob = new Blob([content], { type: 'text/plain' });
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.style.display = 'none';
a.href = url;
a.download = filename;
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
}
function downloadImage(elementId, filename) {
const content = document.getElementById(elementId).src;
const a = document.createElement('a');
a.style.display = 'none';
a.href = content;
a.download = filename;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
}
document.getElementById('crawlForm').addEventListener('submit', function(event) {
event.preventDefault();
document.getElementById('loading').style.display = 'block';
document.getElementById('response').style.display = 'none';
const url = document.getElementById('url').value;
const screenshot = document.getElementById('screenshot').checked;
const data = {
urls: [url],
bypass_cache: false,
word_count_threshold: 5,
screenshot: screenshot
};
fetch('https://crawl4ai.com/crawl', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(data)
})
.then(response => {
if (!response.ok) {
if (response.status === 429) {
return response.json().then(err => {
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
});
}
throw new Error('Network response was not ok');
}
return response.json();
})
.then(data => {
data = data.results[0]; // Only one URL is requested
document.getElementById('loading').style.display = 'none';
document.getElementById('response').style.display = 'block';
redo(document.getElementById('markdownContent'), data.markdown);
redo(document.getElementById('cleanedHtmlContent'), data.cleaned_html);
redo(document.getElementById('mediaContent'), JSON.stringify(data.media, null, 2));
redo(document.getElementById('extractedContentContent'), data.extracted_content);
if (screenshot) {
document.getElementById('screenshotContent').src = `data:image/png;base64,${data.screenshot}`;
}
const pythonCode = `
from crawl4ai.web_crawler import WebCrawler
crawler = WebCrawler()
crawler.warmup()
result = crawler.run(
url='${url}',
screenshot=${screenshot}
)
print(result)
`;
redo(document.getElementById('pythonCode'), pythonCode);
document.getElementById('error').style.display = 'none';
})
.catch(error => {
document.getElementById('loading').style.display = 'none';
document.getElementById('error').style.display = 'block';
let errorMessage = 'An unexpected error occurred. Please try again later.';
if (error.status === 429) {
const details = error.details;
if (details.retry_after) {
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
} else if (details.reset_at) {
const resetTime = new Date(details.reset_at);
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
} else {
errorMessage = `Rate limit exceeded. Please try again later.`;
}
} else if (error.message) {
errorMessage = error.message;
}
document.querySelector('#error .terminal-alert').textContent = errorMessage;
});
});
</script>
</div>

158
main.py
View File

@@ -380,97 +380,97 @@ def read_root():
return {"message": "Crawl4AI API service is running"} return {"message": "Crawl4AI API service is running"}
# @app.post("/crawl", dependencies=[Depends(verify_token)]) @app.post("/crawl", dependencies=[Depends(verify_token)])
# async def crawl(request: CrawlRequest) -> Dict[str, str]: async def crawl(request: CrawlRequest) -> Dict[str, str]:
# task_id = await crawler_service.submit_task(request) task_id = await crawler_service.submit_task(request)
# return {"task_id": task_id} return {"task_id": task_id}
# @app.get("/task/{task_id}", dependencies=[Depends(verify_token)]) @app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
# async def get_task_status(task_id: str): async def get_task_status(task_id: str):
# task_info = crawler_service.task_manager.get_task(task_id) task_info = crawler_service.task_manager.get_task(task_id)
# if not task_info: if not task_info:
# raise HTTPException(status_code=404, detail="Task not found") raise HTTPException(status_code=404, detail="Task not found")
# response = { response = {
# "status": task_info.status, "status": task_info.status,
# "created_at": task_info.created_at, "created_at": task_info.created_at,
# } }
# if task_info.status == TaskStatus.COMPLETED: if task_info.status == TaskStatus.COMPLETED:
# # Convert CrawlResult to dict for JSON response # Convert CrawlResult to dict for JSON response
# if isinstance(task_info.result, list): if isinstance(task_info.result, list):
# response["results"] = [result.dict() for result in task_info.result] response["results"] = [result.dict() for result in task_info.result]
# else: else:
# response["result"] = task_info.result.dict() response["result"] = task_info.result.dict()
# elif task_info.status == TaskStatus.FAILED: elif task_info.status == TaskStatus.FAILED:
# response["error"] = task_info.error response["error"] = task_info.error
# return response return response
# @app.post("/crawl_sync", dependencies=[Depends(verify_token)]) @app.post("/crawl_sync", dependencies=[Depends(verify_token)])
# async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
# task_id = await crawler_service.submit_task(request) task_id = await crawler_service.submit_task(request)
# # Wait up to 60 seconds for task completion # Wait up to 60 seconds for task completion
# for _ in range(60): for _ in range(60):
# task_info = crawler_service.task_manager.get_task(task_id) task_info = crawler_service.task_manager.get_task(task_id)
# if not task_info: if not task_info:
# raise HTTPException(status_code=404, detail="Task not found") raise HTTPException(status_code=404, detail="Task not found")
# if task_info.status == TaskStatus.COMPLETED: if task_info.status == TaskStatus.COMPLETED:
# # Return same format as /task/{task_id} endpoint # Return same format as /task/{task_id} endpoint
# if isinstance(task_info.result, list): if isinstance(task_info.result, list):
# return {"status": task_info.status, "results": [result.dict() for result in task_info.result]} return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
# return {"status": task_info.status, "result": task_info.result.dict()} return {"status": task_info.status, "result": task_info.result.dict()}
# if task_info.status == TaskStatus.FAILED: if task_info.status == TaskStatus.FAILED:
# raise HTTPException(status_code=500, detail=task_info.error) raise HTTPException(status_code=500, detail=task_info.error)
# await asyncio.sleep(1) await asyncio.sleep(1)
# # If we get here, task didn't complete within timeout # If we get here, task didn't complete within timeout
# raise HTTPException(status_code=408, detail="Task timed out") raise HTTPException(status_code=408, detail="Task timed out")
# @app.post("/crawl_direct", dependencies=[Depends(verify_token)]) @app.post("/crawl_direct", dependencies=[Depends(verify_token)])
# async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]: async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
# try: try:
# crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params) crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
# extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config) extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
# try: try:
# if isinstance(request.urls, list): if isinstance(request.urls, list):
# results = await crawler.arun_many( results = await crawler.arun_many(
# urls=[str(url) for url in request.urls], urls=[str(url) for url in request.urls],
# extraction_strategy=extraction_strategy, extraction_strategy=extraction_strategy,
# js_code=request.js_code, js_code=request.js_code,
# wait_for=request.wait_for, wait_for=request.wait_for,
# css_selector=request.css_selector, css_selector=request.css_selector,
# screenshot=request.screenshot, screenshot=request.screenshot,
# magic=request.magic, magic=request.magic,
# cache_mode=request.cache_mode, cache_mode=request.cache_mode,
# session_id=request.session_id, session_id=request.session_id,
# **request.extra, **request.extra,
# ) )
# return {"results": [result.dict() for result in results]} return {"results": [result.dict() for result in results]}
# else: else:
# result = await crawler.arun( result = await crawler.arun(
# url=str(request.urls), url=str(request.urls),
# extraction_strategy=extraction_strategy, extraction_strategy=extraction_strategy,
# js_code=request.js_code, js_code=request.js_code,
# wait_for=request.wait_for, wait_for=request.wait_for,
# css_selector=request.css_selector, css_selector=request.css_selector,
# screenshot=request.screenshot, screenshot=request.screenshot,
# magic=request.magic, magic=request.magic,
# cache_mode=request.cache_mode, cache_mode=request.cache_mode,
# session_id=request.session_id, session_id=request.session_id,
# **request.extra, **request.extra,
# ) )
# return {"result": result.dict()} return {"result": result.dict()}
# finally: finally:
# await crawler_service.crawler_pool.release(crawler) await crawler_service.crawler_pool.release(crawler)
# except Exception as e: except Exception as e:
# logger.error(f"Error in direct crawl: {str(e)}") logger.error(f"Error in direct crawl: {str(e)}")
# raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@app.get("/health") @app.get("/health")
async def health_check(): async def health_check():

View File

@@ -8,7 +8,7 @@ docs_dir: docs/md_v2
nav: nav:
- Home: 'index.md' - Home: 'index.md'
- 'Installation': 'basic/installation.md' - 'Installation': 'basic/installation.md'
- 'Docker Deployment': 'basic/docker-deploymeny.md' - 'Docker Deplotment': 'basic/docker-deploymeny.md'
- 'Quick Start': 'basic/quickstart.md' - 'Quick Start': 'basic/quickstart.md'
- Changelog & Blog: - Changelog & Blog:
- 'Blog Home': 'blog/index.md' - 'Blog Home': 'blog/index.md'

View File

@@ -57,6 +57,9 @@ setup(
author_email="unclecode@kidocode.com", author_email="unclecode@kidocode.com",
license="MIT", license="MIT",
packages=find_packages(), packages=find_packages(),
package_data={
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
},
install_requires=default_requirements install_requires=default_requirements
+ ["playwright", "aiofiles"], # Added aiofiles + ["playwright", "aiofiles"], # Added aiofiles
extras_require={ extras_require={